From ae885db17320662cc8c0e940d8e27a30dc105eb1 Mon Sep 17 00:00:00 2001
From: dukebody <israel.saeta@dukebody.com>
Date: Tue, 11 Aug 2015 20:36:33 +0200
Subject: [PATCH 1/3] If any of the extracted features is sparse, make the
 hstacked result sparse as well.

---
 sklearn_pandas/__init__.py     | 16 ++++++++++------
 tests/test_dataframe_mapper.py | 17 ++++++++++++++++-
 2 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/sklearn_pandas/__init__.py b/sklearn_pandas/__init__.py
index edfc0bf..074b1d9 100644
--- a/sklearn_pandas/__init__.py
+++ b/sklearn_pandas/__init__.py
@@ -2,6 +2,7 @@
 
 import numpy as np
 import pandas as pd
+from scipy import sparse
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn import cross_validation
 from sklearn import grid_search
@@ -55,11 +56,7 @@ def transform(self, X):
 
 
 def _handle_feature(fea):
-    if hasattr(fea, 'toarray'):
-        # sparse arrays should be converted to regular arrays
-        # for hstack.
-        fea = fea.toarray()
-
+    # convert 1-dimensional arrays to 2-dimensional column vectors
     if len(fea.shape) == 1:
         fea = np.array([fea]).T
 
@@ -156,4 +153,11 @@ def transform(self, X):
         # at this point we lose track of which features
         # were created from which input columns, so it's
         # assumed that that doesn't matter to the model.
-        return np.hstack(extracted)
+
+        # If any of the extracted features is sparse, combine to produce a
+        # sparse matrix. Otherwise, produce a dense one.
+        if any(sparse.issparse(fea) for fea in extracted):
+            stacked = sparse.hstack(extracted).tocsr()
+        else:
+            stacked = np.hstack(extracted)
+        return stacked
diff --git a/tests/test_dataframe_mapper.py b/tests/test_dataframe_mapper.py
index 5d90dae..f914af9 100644
--- a/tests/test_dataframe_mapper.py
+++ b/tests/test_dataframe_mapper.py
@@ -9,11 +9,12 @@
 
 from pandas import DataFrame
 import pandas as pd
+from scipy import sparse
 from sklearn.datasets import load_iris
 from sklearn.pipeline import Pipeline
 from sklearn.svm import SVC
 from sklearn.feature_extraction.text import CountVectorizer
-from sklearn.preprocessing import Imputer, StandardScaler
+from sklearn.preprocessing import Imputer, StandardScaler, LabelBinarizer
 import numpy as np
 
 from sklearn_pandas import (
@@ -140,3 +141,17 @@ def test_list_transformers():
     # all features have mean 0 and std deviation 1 (standardized)
     assert (abs(dmatrix.mean(axis=0) - 0) <= 1e-6).all()
     assert (abs(dmatrix.std(axis=0) - 1) <= 1e-6).all()
+
+
+def test_sparse_features(cars_dataframe):
+    """
+    If any of the extracted features is sparse, the hstacked
+    is also sparse.
+    """
+    mapper = DataFrameMapper([
+        ("description", CountVectorizer()),  # sparse feature
+        ("model", LabelBinarizer()),  # dense feature
+    ])
+    dmatrix = mapper.fit_transform(cars_dataframe)
+
+    assert type(dmatrix) == sparse.csr.csr_matrix

From 766f95571c7ac08d0f4be61d160b932c34cacdcd Mon Sep 17 00:00:00 2001
From: dukebody <israel.saeta@dukebody.com>
Date: Sun, 30 Aug 2015 15:01:45 +0200
Subject: [PATCH 2/3] Sparse output only if sparse=True. With docs.

---
 README.rst                     | 15 +++++++++
 sklearn_pandas/__init__.py     | 14 ++++++--
 tests/test_dataframe_mapper.py | 59 ++++++++++++++++++++++++++--------
 3 files changed, 71 insertions(+), 17 deletions(-)

diff --git a/README.rst b/README.rst
index 110556c..c0bf2b1 100644
--- a/README.rst
+++ b/README.rst
@@ -44,6 +44,7 @@ For these examples, we'll also use pandas, numpy, and sklearn::
     >>> import numpy as np
     >>> import sklearn.preprocessing, sklearn.decomposition, \
     ...     sklearn.linear_model, sklearn.pipeline, sklearn.metrics
+    >>> from sklearn.feature_extraction.text import CountVectorizer
 
 Load some Data
 **************
@@ -156,6 +157,20 @@ Only columns that are listed in the DataFrameMapper are kept. To keep a column b
            [ 1.,  0.,  0.,  5.],
            [ 0.,  0.,  1.,  4.]])
 
+
+Working with sparse features
+****************************
+
+A ``DataFrameMapper`` returns a dense feature array by default. Setting ``sparse=True`` in the mapper makes it return a sparse array whenever any of the extracted features is sparse. Example::
+
+    >>> mapper4 = DataFrameMapper([
+    ...     ('pet', CountVectorizer()),
+    ... ], sparse=True)
+    >>> type(mapper4.fit_transform(data))
+    <class 'scipy.sparse.csr.csr_matrix'>
+
+The stacking of the sparse features is done without ever densifying them.
+
 Cross-Validation
 ----------------
 
diff --git a/sklearn_pandas/__init__.py b/sklearn_pandas/__init__.py
index 074b1d9..4324684 100644
--- a/sklearn_pandas/__init__.py
+++ b/sklearn_pandas/__init__.py
@@ -69,7 +69,7 @@ class DataFrameMapper(BaseEstimator, TransformerMixin):
     sklearn transformation.
     """
 
-    def __init__(self, features):
+    def __init__(self, features, sparse=False):
         """
         Params:
 
@@ -77,8 +77,11 @@ def __init__(self, features):
                     selector. This can be a string (for one column) or a list
                     of strings. The second element is an object that supports
                     sklearn's transform interface.
+        sparse      will return sparse matrix if set True and any of the
+                    extracted features is sparse. Defaults to False.
         """
         self.features = features
+        self.sparse = sparse
 
     def _get_col_subset(self, X, cols):
         """
@@ -154,10 +157,15 @@ def transform(self, X):
         # were created from which input columns, so it's
         # assumed that that doesn't matter to the model.
 
-        # If any of the extracted features is sparse, combine to produce a
-        # sparse matrix. Otherwise, produce a dense one.
+        # If any of the extracted features is sparse, combine sparsely.
+        # Otherwise, combine as normal arrays.
         if any(sparse.issparse(fea) for fea in extracted):
             stacked = sparse.hstack(extracted).tocsr()
+            # return a sparse matrix only if the mapper was initialized
+            # with sparse=True
+            if not self.sparse:
+                stacked = stacked.toarray()
         else:
             stacked = np.hstack(extracted)
+
         return stacked
diff --git a/tests/test_dataframe_mapper.py b/tests/test_dataframe_mapper.py
index f914af9..4588369 100644
--- a/tests/test_dataframe_mapper.py
+++ b/tests/test_dataframe_mapper.py
@@ -14,7 +14,8 @@
 from sklearn.pipeline import Pipeline
 from sklearn.svm import SVC
 from sklearn.feature_extraction.text import CountVectorizer
-from sklearn.preprocessing import Imputer, StandardScaler, LabelBinarizer
+from sklearn.preprocessing import Imputer, StandardScaler
+from sklearn.base import BaseEstimator, TransformerMixin
 import numpy as np
 
 from sklearn_pandas import (
@@ -24,6 +25,17 @@
 )
 
 
+class ToSparseTransformer(BaseEstimator, TransformerMixin):
+    """
+    Transforms numpy matrix to sparse format.
+    """
+    def fit(self, X):
+        return self
+
+    def transform(self, X):
+        return sparse.csr_matrix(X)
+
+
 @pytest.fixture
 def iris_dataframe():
     iris = load_iris()
@@ -43,6 +55,11 @@ def cars_dataframe():
     return pd.read_csv("tests/test_data/cars.csv.gz", compression='gzip')
 
 
+@pytest.fixture
+def simple_dataframe():
+    return pd.DataFrame({'a': [1, 2, 3]})
+
+
 def test_nonexistent_columns_explicit_fail(iris_dataframe):
     """
     If a nonexistent column is selected, KeyError is raised.
@@ -93,32 +110,32 @@ def test_with_car_dataframe(cars_dataframe):
     assert scores.mean() > 0.30
 
 
-def test_cols_string_array():
+def test_cols_string_array(simple_dataframe):
     """
     If an string specified as the columns, the transformer
     is called with a 1-d array as input.
     """
-    dataframe = pd.DataFrame({"a": [1, 2, 3]})
+    df = simple_dataframe
     mock_transformer = Mock()
     mock_transformer.transform.return_value = np.array([1, 2, 3])  # do nothing
     mapper = DataFrameMapper([("a", mock_transformer)])
 
-    mapper.fit_transform(dataframe)
+    mapper.fit_transform(df)
     args, kwargs = mock_transformer.fit.call_args
     assert args[0].shape == (3,)
 
 
-def test_cols_list_column_vector():
+def test_cols_list_column_vector(simple_dataframe):
     """
     If a one-element list is specified as the columns, the transformer
     is called with a column vector as input.
     """
-    dataframe = pd.DataFrame({"a": [1, 2, 3]})
+    df = simple_dataframe
     mock_transformer = Mock()
     mock_transformer.transform.return_value = np.array([1, 2, 3])  # do nothing
     mapper = DataFrameMapper([(["a"], mock_transformer)])
 
-    mapper.fit_transform(dataframe)
+    mapper.fit_transform(df)
     args, kwargs = mock_transformer.fit.call_args
     assert args[0].shape == (3, 1)
 
@@ -143,15 +160,29 @@ def test_list_transformers():
     assert (abs(dmatrix.std(axis=0) - 1) <= 1e-6).all()
 
 
-def test_sparse_features(cars_dataframe):
+def test_sparse_features(simple_dataframe):
     """
-    If any of the extracted features is sparse, the hstacked
-    is also sparse.
+    If any of the extracted features is sparse and "sparse" argument
+    is true, the hstacked result is also sparse.
     """
+    df = simple_dataframe
     mapper = DataFrameMapper([
-        ("description", CountVectorizer()),  # sparse feature
-        ("model", LabelBinarizer()),  # dense feature
-    ])
-    dmatrix = mapper.fit_transform(cars_dataframe)
+        ("a", ToSparseTransformer())
+    ], sparse=True)
+    dmatrix = mapper.fit_transform(df)
 
     assert type(dmatrix) == sparse.csr.csr_matrix
+
+
+def test_sparse_off(simple_dataframe):
+    """
+    If the resulting features are sparse but the "sparse" argument
+    of the mapper is False, return a non-sparse matrix.
+    """
+    df = simple_dataframe
+    mapper = DataFrameMapper([
+        ("a", ToSparseTransformer())
+    ], sparse=False)
+
+    dmatrix = mapper.fit_transform(df)
+    assert type(dmatrix) != sparse.csr.csr_matrix

From 1534015078208d070ce79d23617d8f4ba5746b90 Mon Sep 17 00:00:00 2001
From: dukebody <israel.saeta@dukebody.com>
Date: Sat, 7 Nov 2015 20:51:57 +0100
Subject: [PATCH 3/3] Update changelog.

---
 README.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.rst b/README.rst
index c0bf2b1..70b1095 100644
--- a/README.rst
+++ b/README.rst
@@ -194,6 +194,7 @@ Changelog
 ********************
 
 * Raise ``KeyError`` when selecting unexistent columns in the dataframe. Fixes #30.
+* Return a sparse feature array if any of the features is sparse and the ``sparse`` argument is ``True``. Defaults to ``False`` to avoid breaking existing code. Resolves #34.
 
 
 0.0.12 (2015-11-07)