|
| 1 | +"""Testing for Logistic regression """ |
| 2 | + |
| 3 | +# Authors: Daniel Emaasit <daniel.emaasit@gmail.com> |
| 4 | +# |
| 5 | +# License: BSD 3 clause |
| 6 | + |
| 7 | +import pytest |
| 8 | +import numpy.testing as npt |
| 9 | +import pandas.testing as pdt |
1 | 10 | import shutil
|
2 | 11 | import tempfile
|
3 |
| -import unittest |
4 | 12 |
|
5 | 13 | import numpy as np
|
6 |
| -import pandas as pd |
7 | 14 | from pymc3 import summary
|
8 | 15 | from sklearn.model_selection import train_test_split
|
9 | 16 |
|
10 | 17 | from pmlearn.exceptions import NotFittedError
|
11 | 18 | from pmlearn.linear_model import HierarchicalLogisticRegression
|
12 | 19 |
|
13 | 20 |
|
14 |
class TestHierarchicalLogisticRegression(object):
    """Shared fixture for HierarchicalLogisticRegression tests.

    Simulates three categories of logistic-regression data with known
    ground-truth intercepts (``alphas``) and slopes (``betas``), splits
    it into train/test sets, and creates a fresh model plus a temporary
    directory for the save/load tests.
    """

    def setup_method(self):
        def numpy_invlogit(x):
            # Inverse logit (sigmoid): maps reals to (0, 1) probabilities.
            return 1 / (1 + np.exp(-x))

        self.num_cats = 3
        self.num_pred = 1
        self.num_samples_per_cat = 1000

        # Ground-truth per-category parameters the model should recover.
        self.alphas = np.random.randn(self.num_cats)
        self.betas = np.random.randn(self.num_cats, self.num_pred)
        # TODO: make this more efficient; right now, it's very explicit
        # so I understand it.
        x_a = np.random.randn(self.num_samples_per_cat, self.num_pred)
        y_a = np.random.binomial(1,
                                 numpy_invlogit(self.alphas[0] +
                                                np.sum(self.betas[0] * x_a, 1)
                                                ))
        x_b = np.random.randn(self.num_samples_per_cat, self.num_pred)
        y_b = np.random.binomial(1,
                                 numpy_invlogit(self.alphas[1] +
                                                np.sum(self.betas[1] * x_b, 1)
                                                ))
        x_c = np.random.randn(self.num_samples_per_cat, self.num_pred)
        y_c = np.random.binomial(1,
                                 numpy_invlogit(self.alphas[2] +
                                                np.sum(self.betas[2] * x_c, 1)
                                                ))

        X = np.concatenate([x_a, x_b, x_c])
        y = np.concatenate([y_a, y_b, y_c])
        # Category index (0, 1, 2) for each row.  NOTE: `np.int` was
        # deprecated in NumPy 1.20 and removed in 1.24; the builtin
        # `int` is the exact equivalent dtype.
        cats = np.concatenate([
            np.zeros(self.num_samples_per_cat, dtype=int),
            np.ones(self.num_samples_per_cat, dtype=int),
            2 * np.ones(self.num_samples_per_cat, dtype=int)
        ])

        self.X_train, self.X_test, self.cat_train, self.cat_test, \
            self.y_train, self.y_test = train_test_split(
                X, cats, y, test_size=0.4
            )

        self.advi_hlr = HierarchicalLogisticRegression()

        # Scratch directory for the save/load round-trip tests.
        self.test_dir = tempfile.mkdtemp()

    def teardown_method(self):
        # Remove the temp directory so repeated runs don't accumulate state.
        shutil.rmtree(self.test_dir)
|
51 | 69 |
|
52 | 70 |
|
53 |
class TestHierarchicalLogisticRegressionFit(TestHierarchicalLogisticRegression):
    """Check that ADVI fitting recovers sensible model parameters."""

    def test_advi_fit_returns_correct_model(self):
        # Note: print is here so PyMC3 output won't overwrite the test name
        print('')

        self.advi_hlr.fit(self.X_train, self.y_train, self.cat_train,
                          minibatch_size=500, inference_args={"n": 50000})

        npt.assert_equal(self.num_cats, self.advi_hlr.num_cats)
        npt.assert_equal(self.num_pred, self.advi_hlr.num_pred)

        # TODO: Figure out best way to test
        # np.testing.assert_almost_equal(self.alphas,
        #     self.advi_hlr.trace['alphas'].mean(), decimal=1)
        # np.testing.assert_almost_equal(self.betas,
        #     self.advi_hlr.trace['betas'].mean(), decimal=1)

        # For now, just check that the estimated parameters
        # have the correct signs.
        fitted_alpha_signs = np.sign(self.advi_hlr.trace['alpha'].mean(axis=0))
        fitted_beta_signs = np.sign(self.advi_hlr.trace['beta'].mean(axis=0))
        npt.assert_equal(np.sign(self.alphas), fitted_alpha_signs)
        npt.assert_equal(np.sign(self.betas), fitted_beta_signs)
|
75 | 97 |
|
76 | 98 |
|
77 |
class TestHierarchicalLogisticRegressionPredictProba(
        TestHierarchicalLogisticRegression):
    """Tests for HierarchicalLogisticRegression.predict_proba."""

    def test_predict_proba_returns_probabilities(self):
        # Note: print is here so PyMC3 output won't overwrite the test name
        print('')
        self.advi_hlr.fit(self.X_train, self.y_train, self.cat_train,
                          minibatch_size=500, inference_args={"n": 50000})
        predicted = self.advi_hlr.predict_proba(self.X_test, self.cat_test)
        # Expect one probability per test sample.
        npt.assert_equal(predicted.shape, self.y_test.shape)

    def test_predict_proba_returns_probabilities_and_std(self):
        # Note: print is here so PyMC3 output won't overwrite the test name
        print('')
        self.advi_hlr.fit(self.X_train, self.y_train, self.cat_train,
                          minibatch_size=500, inference_args={"n": 50000})
        predicted, uncertainty = self.advi_hlr.predict_proba(
            self.X_test, self.cat_test, return_std=True)
        # Both the probabilities and their standard deviations are
        # per-sample arrays.
        npt.assert_equal(predicted.shape, self.y_test.shape)
        npt.assert_equal(uncertainty.shape, self.y_test.shape)

    def test_predict_proba_raises_error_if_not_fit(self):
        # A freshly constructed (never fitted) model must refuse to predict.
        with pytest.raises(NotFittedError):
            unfitted_hlr = HierarchicalLogisticRegression()
            unfitted_hlr.predict_proba(self.X_train, self.cat_train)
98 | 123 |
|
class TestHierarchicalLogisticRegressionPredict(
        TestHierarchicalLogisticRegression):
    """Tests for HierarchicalLogisticRegression.predict."""

    def test_predict_returns_predictions(self):
        # Note: print is here so PyMC3 output won't overwrite the test name
        print('')
        self.advi_hlr.fit(self.X_train, self.y_train, self.cat_train,
                          minibatch_size=500, inference_args={"n": 50000})
        predictions = self.advi_hlr.predict(self.X_test, self.cat_test)
        # Expect one predicted label per test sample.
        npt.assert_equal(predictions.shape, self.y_test.shape)
106 | 134 |
|
class TestHierarchicalLogisticRegressionScore(
        TestHierarchicalLogisticRegression):
    """Tests for HierarchicalLogisticRegression.score."""

    def test_score_scores(self):
        # Note: print is here so PyMC3 output won't overwrite the test name
        print('')
        self.advi_hlr.fit(self.X_train, self.y_train, self.cat_train,
                          minibatch_size=500, inference_args={"n": 50000})
        score = self.advi_hlr.score(self.X_test, self.y_test, self.cat_test)
        # Baseline: the empirical positive rate of the test labels.  The
        # fitted model should do at least as well.  Use `>=` rather than
        # npt.assert_array_less, which enforces a *strict* inequality and
        # would spuriously fail when the model exactly ties the baseline.
        naive_score = np.mean(self.y_test)
        assert score >= naive_score
115 | 146 |
|
class TestHierarchicalLogisticRegressionSaveandLoad(
        TestHierarchicalLogisticRegression):
    """Round-trip save/load tests for HierarchicalLogisticRegression."""

    def test_save_and_load_work_correctly(self):
        # Note: print is here so PyMC3 output won't overwrite the test name
        print('')
        self.advi_hlr.fit(self.X_train, self.y_train, self.cat_train,
                          minibatch_size=500, inference_args={"n": 50000})
        probs_before = self.advi_hlr.predict_proba(self.X_test, self.cat_test)
        self.advi_hlr.save(self.test_dir)

        loaded_hlr = HierarchicalLogisticRegression()
        loaded_hlr.load(self.test_dir)

        # The reloaded model must carry over the fitted metadata...
        npt.assert_equal(self.advi_hlr.num_cats, loaded_hlr.num_cats)
        npt.assert_equal(self.advi_hlr.num_pred, loaded_hlr.num_pred)
        npt.assert_equal(self.advi_hlr.num_training_samples,
                         loaded_hlr.num_training_samples)
        # ...and an equivalent posterior trace.
        pdt.assert_frame_equal(summary(self.advi_hlr.trace),
                               summary(loaded_hlr.trace))

        # Predictions from the reloaded model should closely match the
        # pre-save predictions.
        probs_after = loaded_hlr.predict_proba(self.X_test, self.cat_test)
        npt.assert_almost_equal(probs_after, probs_before, decimal=1)
0 commit comments