[med-svn] [Git][med-team/q2-sample-classifier][upstream] New upstream version 2020.11.0
Steffen Möller
gitlab at salsa.debian.org
Wed Dec 2 13:39:34 GMT 2020
Steffen Möller pushed to branch upstream at Debian Med / q2-sample-classifier
Commits:
31dd5bad by Steffen Moeller at 2020-12-02T13:34:25+01:00
New upstream version 2020.11.0
- - - - -
25 changed files:
- .gitignore
- LICENSE
- ci/recipe/meta.yaml
- q2_sample_classifier/__init__.py
- q2_sample_classifier/_format.py
- q2_sample_classifier/_transformer.py
- q2_sample_classifier/_type.py
- q2_sample_classifier/_version.py
- q2_sample_classifier/classify.py
- q2_sample_classifier/plugin_setup.py
- q2_sample_classifier/tests/__init__.py
- q2_sample_classifier/tests/data/ecam_map_maturity.txt
- + q2_sample_classifier/tests/data/vaw.qza
- + q2_sample_classifier/tests/data/vaw.txt
- + q2_sample_classifier/tests/data/vaw_importance.tsv
- + q2_sample_classifier/tests/test_actions.py
- + q2_sample_classifier/tests/test_base_class.py
- q2_sample_classifier/tests/test_classifier.py
- + q2_sample_classifier/tests/test_estimators.py
- + q2_sample_classifier/tests/test_types_formats_transformers.py
- + q2_sample_classifier/tests/test_utilities.py
- + q2_sample_classifier/tests/test_visualization.py
- q2_sample_classifier/utilities.py
- q2_sample_classifier/visuals.py
- setup.py
Changes:
=====================================
.gitignore
=====================================
@@ -69,5 +69,8 @@ target/
# other
*~
+*.code-workspace
+.vscode/
+
.DS_store
=====================================
LICENSE
=====================================
@@ -1,6 +1,6 @@
BSD 3-Clause License

-Copyright (c) 2017-2019, QIIME 2 development team.
+Copyright (c) 2017-2020, QIIME 2 development team.
All rights reserved.

Redistribution and use in source and binary forms, with or without
=====================================
ci/recipe/meta.yaml
=====================================
@@ -19,10 +19,10 @@ requirements:
run:
- python {{ python }}
- - pandas
+ - pandas >=1
- scipy
- joblib
- - scikit-learn
+ - scikit-learn >=0.22.1
- scikit-bio
- seaborn >=0.8
- fastcluster
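
These raised lower bounds match the code changes later in this diff: classify.py drops IsolationForest's behaviour= argument (gone as of scikit-learn 0.22) and the rewritten tests expect pandas 1.x behaviour. A minimal sketch of checking the same bounds at runtime; the packaging module is an assumed extra dependency, not part of this recipe:

    import pandas
    import sklearn
    from packaging.version import Version  # assumed available, not in the recipe

    # mirror the recipe's run requirements
    assert Version(pandas.__version__) >= Version("1")
    assert Version(sklearn.__version__) >= Version("0.22.1")
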
=====================================
q2_sample_classifier/__init__.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2017-2019, QIIME 2 development team.
+# Copyright (c) 2017-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
=====================================
q2_sample_classifier/_format.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2017-2019, QIIME 2 development team.
+# Copyright (c) 2017-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
=====================================
q2_sample_classifier/_transformer.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2017-2019, QIIME 2 development team.
+# Copyright (c) 2017-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
=====================================
q2_sample_classifier/_type.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2017-2019, QIIME 2 development team.
+# Copyright (c) 2017-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
=====================================
q2_sample_classifier/_version.py
=====================================
@@ -23,9 +23,9 @@ def get_keywords():
# setup.py/versioneer.py will grep for the variable names, so they must
# each be defined on a line of their own. _version.py will just call
# get_keywords().
- git_refnames = " (tag: 2019.10.0)"
- git_full = "b4dda7c2b674409f29fbf78d648cb5aa949930f9"
- git_date = "2019-11-01 01:04:43 +0000"
+ git_refnames = " (HEAD -> master, tag: 2020.11.0)"
+ git_full = "ceca1ef65671ec878e973e5f31b23e81a8feba6f"
+ git_date = "2020-11-25 17:13:13 +0000"
keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
return keywords
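
For context, versioneer greps the three assignments above out of the source at build time (as the comment says) and later parses the version back out of git_refnames. Roughly, as a simplified sketch, not versioneer's actual code:

    import re

    refnames = " (HEAD -> master, tag: 2020.11.0)"
    # versioneer scans the ref list for "tag: <name>" entries
    tags = re.findall(r"tag: ([^,)]+)", refnames)
    print(tags)  # ['2020.11.0']
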
=====================================
q2_sample_classifier/classify.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2017-2019, QIIME 2 development team.
+# Copyright (c) 2017-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
@@ -371,6 +371,8 @@ def confusion_matrix(output_dir: str,
if vmax == 'auto':
vmax = None
+ predictions = predictions.astype(str)
+
_plot_accuracy(output_dir, predictions, truth, probabilities,
missing_samples=missing_samples,
classification=True, palette=palette,
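
The new astype(str) cast runs before plotting, presumably so that numeric class labels compare equal to the string-typed categories QIIME 2 metadata produces. A tiny illustration with made-up values:

    import pandas as pd

    predictions = pd.Series([0, 1, 1, 0], name='prediction')
    truth = pd.Series(['0', '1', '0', '0'], name='truth')
    # int vs. str never compare equal; the cast aligns the dtypes
    predictions = predictions.astype(str)
    print((predictions == truth).sum())  # 3
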
@@ -467,13 +469,12 @@ def detect_outliers(table: biom.Table,
X_train = features
# fit isolation tree
- # TODO: update to behavior='new' if we ever move this out of experimental
- estimator = Pipeline(
- [('dv', DictVectorizer()),
- ('est', IsolationForest(n_jobs=n_jobs, n_estimators=n_estimators,
- contamination=contamination,
- random_state=random_state,
- behaviour='old'))])
+ estimator = Pipeline([('dv', DictVectorizer()),
+ ('est', IsolationForest(n_jobs=n_jobs,
+ n_estimators=n_estimators,
+ contamination=contamination,
+ random_state=random_state,
+ ))])
estimator.fit(X_train)
# predict outlier status
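
Dropping behaviour='old' is what forces the scikit-learn >=0.22.1 pin in the recipe above: the behaviour parameter was deprecated in scikit-learn 0.22 and removed in 0.24, leaving the new scoring convention as the only one. A self-contained sketch of the post-0.22 call on made-up data (the DictVectorizer step from the pipeline above is omitted):

    import numpy as np
    from sklearn.ensemble import IsolationForest

    X = np.random.RandomState(123).normal(size=(100, 5))
    est = IsolationForest(n_estimators=100, contamination=0.05,
                          random_state=123, n_jobs=1)
    est.fit(X)
    labels = est.predict(X)  # 1 = inlier, -1 = outlier
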
=====================================
q2_sample_classifier/plugin_setup.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2017-2019, QIIME 2 development team.
+# Copyright (c) 2017-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
@@ -13,7 +13,7 @@ from qiime2.plugin import (
Numeric, Categorical, Citations, Visualization, TypeMatch)
from q2_types.feature_table import (
FeatureTable, Frequency, RelativeFrequency, PresenceAbsence, Balance,
- PercentileNormalized)
+ PercentileNormalized, Design)
from q2_types.sample_data import SampleData
from q2_types.feature_data import FeatureData
from q2_types.distance_matrix import DistanceMatrix
@@ -475,7 +475,7 @@ plugin.visualizers.register_function(
T = TypeMatch([Frequency, RelativeFrequency, PresenceAbsence, Balance,
- PercentileNormalized])
+ PercentileNormalized, Design])
plugin.methods.register_function(
function=split_table,
inputs={'table': FeatureTable[T]},
=====================================
q2_sample_classifier/tests/__init__.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2017-2019, QIIME 2 development team.
+# Copyright (c) 2017-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
=====================================
q2_sample_classifier/tests/data/ecam_map_maturity.txt
=====================================
@@ -124,4 +124,4 @@
10249.C001.36SS y 855 Vaginal bd eb C 28 28.1 Vaginal.bd.y.1 Female 1 Vaginal.bd.y.1
10249.C005.22SS n 855 Cesarean fd fd C 28 28.1 Cesarean.fd.n.1 Female 5 Cesarean.fd.n.1
10249.C002.21SD n 856 Cesarean bd eb C 28 28.1 Cesarean.bd.n.1 Male 2 Cesarean.bd.n.1
-10249.C002.21SS n 856 Cesarean bd eb C 28 28.1 Cesarean.bd.n.1 Male 2 Cesarean.bd.n.1
\ No newline at end of file
+10249.C002.21SS n 856 Cesarean bd eb C 28 28.1 Cesarean.bd.n.1 Male 2 Cesarean.bd.n.1
=====================================
q2_sample_classifier/tests/data/vaw.qza
=====================================
Binary files /dev/null and b/q2_sample_classifier/tests/data/vaw.qza differ
=====================================
q2_sample_classifier/tests/data/vaw.txt
=====================================
@@ -0,0 +1,7 @@
+#SampleID Column
+Sample1 a
+Sample2 a
+Sample3 a
+Sample4 b
+Sample5 b
+Sample6 b
=====================================
q2_sample_classifier/tests/data/vaw_importance.tsv
=====================================
@@ -0,0 +1,6 @@
+ importance
+GG_OTU_1 0.084698283208355865
+GG_OTU_2 0.077601184175696976
+GG_OTU_3 0.065702517505059144
+GG_OTU_4 0.061718558716901406
+GG_OTU_5 0.028086160290024458
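
Fixtures like this one become FeatureData[Importance] artifacts with the same pattern TestHeatmap.setUp uses further down in this diff; a sketch, assuming the file sits in the working directory:

    import pandas as pd
    import qiime2

    imp = pd.read_csv('vaw_importance.tsv', sep='\t', header=0, index_col=0)
    imp_artifact = qiime2.Artifact.import_data('FeatureData[Importance]', imp)
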
=====================================
q2_sample_classifier/tests/test_actions.py
=====================================
@@ -0,0 +1,160 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2017-2020, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+import pandas as pd
+import numpy as np
+import biom
+
+import qiime2
+from qiime2.plugins import sample_classifier
+
+from q2_sample_classifier.tests.test_base_class import \
+ SampleClassifierTestPluginBase
+from q2_sample_classifier.tests.test_estimators import SampleEstimatorTestBase
+from q2_sample_classifier.classify import summarize
+
+
+class NowLetsTestTheActions(SampleClassifierTestPluginBase):
+
+ def setUp(self):
+ super().setUp()
+ md = pd.Series(['a', 'a', 'b', 'b', 'b'],
+ index=['a', 'b', 'c', 'd', 'e'], name='bugs')
+ md.index.name = 'SampleID'
+ self.md = qiime2.CategoricalMetadataColumn(md)
+
+ tab = biom.Table(
+ np.array([[3, 6, 7, 3, 6], [3, 4, 5, 6, 2], [8, 6, 4, 1, 0],
+ [8, 6, 4, 1, 0], [8, 6, 4, 1, 0]]),
+ observation_ids=['v', 'w', 'x', 'y', 'z'],
+ sample_ids=['a', 'b', 'c', 'd', 'e'])
+ self.tab = qiime2.Artifact.import_data('FeatureTable[Frequency]', tab)
+
+ md2 = pd.DataFrame({'trash': ['a', 'a', 'b', 'b', 'b', 'junk'],
+ 'floats': [0.1, 0.1, 1.3, 1.8, 1000.1, 0.1],
+ 'ints': [0, 1, 2, 2, 2, 0],
+ 'nans': [1, 1, 2, 2, np.nan, np.nan],
+ 'negatives': [-7, -3, -1.2, -4, -9, -1]},
+ index=['a', 'b', 'c', 'd', 'e', 'peanut'])
+ md2.index.name = 'SampleID'
+ self.md2 = qiime2.Metadata(md2)
+
+ # let's make sure the correct transformers are in place! See issue 114
+ # if this runs without error, that's good enough for me. We already
+ # validate the function above.
+ def test_action_split_table(self):
+ sample_classifier.actions.split_table(self.tab, self.md, test_size=0.5)
+
+ def test_metatable(self):
+ exp = biom.Table(
+ np.array([[0.1, 0.1, 1.3, 1.8, 1000.1, 0.1],
+ [0, 1, 2, 2, 2, 0]]),
+ observation_ids=['floats', 'ints'],
+ sample_ids=['a', 'b', 'c', 'd', 'e', 'peanut'])
+ res, = sample_classifier.actions.metatable(
+ self.md2, missing_values='drop_features')
+ report = res.view(biom.Table).descriptive_equality(exp)
+ self.assertIn('Tables appear equal', report, report)
+
+ def test_metatable_missing_error(self):
+ with self.assertRaisesRegex(ValueError, "missing values"):
+ sample_classifier.actions.metatable(
+ self.md2, missing_values='error')
+
+ def test_metatable_drop_samples(self):
+ exp = biom.Table(
+ np.array([[3, 6, 7, 3], [3, 4, 5, 6], [8, 6, 4, 1],
+ [8, 6, 4, 1], [8, 6, 4, 1],
+ [0.1, 0.1, 1.3, 1.8],
+ [0, 1, 2, 2], [1, 1, 2, 2]]),
+ observation_ids=['v', 'w', 'x', 'y', 'z', 'floats', 'ints',
+ 'nans'],
+ sample_ids=['a', 'b', 'c', 'd'])
+ res, = sample_classifier.actions.metatable(
+ self.md2, self.tab, missing_values='drop_samples')
+ report = res.view(biom.Table).descriptive_equality(exp)
+ self.assertIn('Tables appear equal', report, report)
+
+ def test_metatable_fill_na(self):
+ exp = biom.Table(
+ np.array([[3, 6, 7, 3, 6], [3, 4, 5, 6, 2], [8, 6, 4, 1, 0],
+ [8, 6, 4, 1, 0], [8, 6, 4, 1, 0],
+ [0.1, 0.1, 1.3, 1.8, 1000.1],
+ [0, 1, 2, 2, 2], [1., 1., 2., 2., 0.]]),
+ observation_ids=['v', 'w', 'x', 'y', 'z', 'floats', 'ints',
+ 'nans'],
+ sample_ids=['a', 'b', 'c', 'd', 'e'])
+ res, = sample_classifier.actions.metatable(
+ self.md2, self.tab, missing_values='fill')
+ report = res.view(biom.Table).descriptive_equality(exp)
+ self.assertIn('Tables appear equal', report, report)
+
+ def test_metatable_with_merge(self):
+ exp = biom.Table(
+ np.array([[3, 6, 7, 3, 6], [3, 4, 5, 6, 2], [8, 6, 4, 1, 0],
+ [8, 6, 4, 1, 0], [8, 6, 4, 1, 0],
+ [0.1, 0.1, 1.3, 1.8, 1000.1],
+ [0, 1, 2, 2, 2]]),
+ observation_ids=['v', 'w', 'x', 'y', 'z', 'floats', 'ints'],
+ sample_ids=['a', 'b', 'c', 'd', 'e'])
+ res, = sample_classifier.actions.metatable(
+ self.md2, self.tab, missing_values='drop_features')
+ report = res.view(biom.Table).descriptive_equality(exp)
+ self.assertIn('Tables appear equal', report, report)
+
+ def test_metatable_with_merge_successful_inner_join(self):
+ exp = biom.Table(
+ np.array([[3, 6, 7, 3], [3, 4, 5, 6], [8, 6, 4, 1],
+ [8, 6, 4, 1], [8, 6, 4, 1], [0.1, 0.1, 1.3, 1.8],
+ [0, 1, 2, 2], [1., 1., 2., 2.]]),
+ observation_ids=['v', 'w', 'x', 'y', 'z', 'floats', 'ints',
+ 'nans'],
+ sample_ids=['a', 'b', 'c', 'd'])
+ res, = sample_classifier.actions.metatable(
+ self.md2.filter_ids(['a', 'b', 'c', 'd']), self.tab,
+ missing_values='error')
+ report = res.view(biom.Table).descriptive_equality(exp)
+ self.assertIn('Tables appear equal', report, report)
+
+ def test_metatable_with_merge_error_inner_join(self):
+ with self.assertRaisesRegex(ValueError, "Missing samples"):
+ sample_classifier.actions.metatable(
+ self.md2.filter_ids(['a', 'b', 'c', 'd']),
+ self.tab, missing_samples='error',
+ missing_values='drop_samples')
+
+ def test_metatable_empty_metadata_after_drop_all_unique(self):
+ with self.assertRaisesRegex(
+ ValueError, "All metadata"): # are belong to us
+ sample_classifier.actions.metatable(
+ self.md2.filter_ids(['b', 'c']), self.tab,
+ missing_values='drop_samples', drop_all_unique=True)
+
+ def test_metatable_no_samples_after_filtering(self):
+ junk_md = pd.DataFrame(
+ {'trash': ['a', 'a', 'b', 'b', 'b', 'junk'],
+ 'floats': [np.nan, np.nan, np.nan, 1.8, 1000.1, 0.1],
+ 'ints': [0, 1, 2, np.nan, 2, 0],
+ 'nans': [1, 1, 2, 2, np.nan, np.nan],
+ 'negatives': [-7, -4, -1.2, -4, -9, -1]},
+ index=['a', 'b', 'c', 'd', 'e', 'peanut'])
+ junk_md.index.name = 'SampleID'
+ junk_md = qiime2.Metadata(junk_md)
+ with self.assertRaisesRegex(ValueError, "All metadata samples"):
+ sample_classifier.actions.metatable(
+ junk_md, missing_values='drop_samples')
+
+
+# make sure summarize visualizer works and that rfe_scores are stored properly
+class TestSummarize(SampleEstimatorTestBase):
+
+ def test_summary_with_rfecv(self):
+ summarize(self.temp_dir.name, self.pipeline)
+
+ def test_summary_without_rfecv(self):
+ del self.pipeline.rfe_scores
+ summarize(self.temp_dir.name, self.pipeline)
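
The metatable tests above double as usage notes: numeric metadata columns become rows of a feature table, and missing_values picks the NaN policy. A condensed, hypothetical call in the same style (runnable in a QIIME 2 2020.11 environment):

    import numpy as np
    import pandas as pd
    import qiime2
    from qiime2.plugins import sample_classifier

    md = pd.DataFrame({'floats': [0.1, 1.3, np.nan], 'ints': [0, 2, 2]},
                      index=pd.Index(['a', 'b', 'c'], name='SampleID'))
    # 'drop_samples' discards sample 'c' (it has a NaN); 'fill' would keep it
    res, = sample_classifier.actions.metatable(
        qiime2.Metadata(md), missing_values='drop_samples')
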
=====================================
q2_sample_classifier/tests/test_base_class.py
=====================================
@@ -0,0 +1,27 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2017-2020, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+import tempfile
+import pkg_resources
+
+from qiime2.plugin.testing import TestPluginBase
+
+
+class SampleClassifierTestPluginBase(TestPluginBase):
+ package = 'q2_sample_classifier.tests'
+
+ def setUp(self):
+ super().setUp()
+ self.temp_dir = tempfile.TemporaryDirectory(
+ prefix='q2-sample-classifier-test-temp-')
+
+ def tearDown(self):
+ self.temp_dir.cleanup()
+
+ def get_data_path(self, filename):
+ return pkg_resources.resource_filename(self.package,
+ 'data/%s' % filename)
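
This new base class centralizes the temp-dir and fixture-path plumbing shared by the test modules added in this commit. A hypothetical subclass (not part of the commit) showing the intended use:

    from q2_sample_classifier.tests.test_base_class import \
        SampleClassifierTestPluginBase


    class ExampleTests(SampleClassifierTestPluginBase):

        def test_fixture_lookup(self):
            # resolves relative to q2_sample_classifier/tests/data/
            path = self.get_data_path('vaw.txt')
            self.assertTrue(path.endswith('vaw.txt'))
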
=====================================
q2_sample_classifier/tests/test_classifier.py
=====================================
@@ -1,216 +1,31 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2017-2019, QIIME 2 development team.
+# Copyright (c) 2017-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
-
-import os
-from os import mkdir
-from os.path import join
from warnings import filterwarnings
-import tempfile
-import shutil
-import json
-import tarfile
-
-import qiime2
import pandas as pd
import numpy as np
-from sklearn.exceptions import ConvergenceWarning
-from q2_sample_classifier.visuals import (
- _linear_regress, _calculate_baseline_accuracy, _custom_palettes,
- _plot_heatmap_from_confusion_matrix, _add_sample_size_to_xtick_labels,
- _roc_palette, _roc_per_class, _roc_micro_average, _roc_macro_average,
- _binarize_labels, _generate_roc_plots)
-from q2_sample_classifier.classify import (
- regress_samples_ncv, classify_samples_ncv, fit_classifier, fit_regressor,
- detect_outliers, split_table, predict_classification,
- predict_regression, scatterplot, confusion_matrix, summarize)
-from q2_sample_classifier.utilities import (
- _set_parameters_and_estimator, _load_data,
- _calculate_feature_importances, _extract_important_features,
- _train_adaboost_base_estimator, _disable_feature_selection,
- _mean_feature_importance, _null_feature_importance, _extract_features,
- _match_series_or_die, _extract_rfe_scores, _predict_and_plot)
-from q2_sample_classifier import (
- BooleanSeriesFormat, BooleanSeriesDirectoryFormat, BooleanSeries,
- PredictionsFormat, PredictionsDirectoryFormat, ClassifierPredictions,
- RegressorPredictions, ImportanceFormat, ImportanceDirectoryFormat,
- Importance, SampleEstimatorDirFmt, PickleFormat, SampleEstimator,
- Classifier, Regressor, ProbabilitiesFormat, ProbabilitiesDirectoryFormat,
- Probabilities)
-from q2_sample_classifier._format import JSONFormat
-from q2_types.sample_data import SampleData
-from q2_types.feature_data import FeatureData
-from q2_types.feature_table import (FeatureTable, PercentileNormalized)
-import pkg_resources
-from qiime2.plugin.testing import TestPluginBase
-from qiime2.plugin import ValidationError
-from qiime2.plugins import sample_classifier, feature_table
-import sklearn
-from sklearn.metrics import mean_squared_error, accuracy_score
-from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
-from sklearn.svm import LinearSVC
-from sklearn.feature_extraction import DictVectorizer
+from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
-from sklearn.pipeline import Pipeline
-import joblib
import pandas.util.testing as pdt
import biom
-import skbio
-
-
-filterwarnings("ignore", category=UserWarning)
-filterwarnings("ignore", category=Warning)
-filterwarnings("ignore", category=ConvergenceWarning)
-
-
-class SampleClassifierTestPluginBase(TestPluginBase):
- package = 'q2_sample_classifier.tests'
-
- def setUp(self):
- super().setUp()
- self.temp_dir = tempfile.TemporaryDirectory(
- prefix='q2-sample-classifier-test-temp-')
-
- def tearDown(self):
- self.temp_dir.cleanup()
-
- def get_data_path(self, filename):
- return pkg_resources.resource_filename(self.package,
- 'data/%s' % filename)
-
-
-class UtilitiesTests(SampleClassifierTestPluginBase):
-
- def setUp(self):
- super().setUp()
-
- exp_rf = pd.DataFrame(
- {'importance': [0.1, 0.2, 0.3]}, index=['a', 'b', 'c'])
- exp_rf.index.name = 'feature'
- self.exp_rf = exp_rf
-
- exp_svm = pd.DataFrame(
- {'importance0': [0.1, 0.2, 0.3], 'importance1': [0.4, 0.5, 0.6]},
- index=['a', 'b', 'c'])
- exp_svm.index.name = 'feature'
- self.exp_svm = exp_svm
-
- exp_lsvm = pd.DataFrame(
- {'importance0': [-0.048794, -0.048794, -0.048794]},
- index=['a', 'b', 'c'])
- exp_lsvm.index.name = 'feature'
- self.exp_lsvm = exp_lsvm
-
- self.features = biom.Table(np.array([[1]*5]*3), ['a', 'b', 'c'],
- list(map(str, range(5))))
-
- self.targets = pd.Series(['a', 'a', 'b', 'b', 'a'], name='bullseye')
-
- def test_extract_important_features_1d_array(self):
- importances = _extract_important_features(
- self.features.ids('observation'),
- np.ndarray((3,), buffer=np.array([0.1, 0.2, 0.3])))
- self.assertEqual(sorted(self.exp_rf), sorted(importances))
-
- def test_extract_important_features_2d_array(self):
- importances = _extract_important_features(
- self.features.ids('observation'),
- np.ndarray(
- (2, 3), buffer=np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6])))
- self.assertEqual(sorted(self.exp_svm), sorted(importances))
- # test feature importance calculation with main classifier types
- def test_calculate_feature_importances_ensemble(self):
- estimator = Pipeline(
- [('dv', DictVectorizer()),
- ('est', RandomForestClassifier(n_estimators=10))])
- estimator.fit(_extract_features(self.features),
- self.targets.values.ravel())
- fi = _calculate_feature_importances(estimator)
- self.assertEqual(sorted(self.exp_rf), sorted(fi))
-
- def test_calculate_feature_importances_svm(self):
- estimator = Pipeline(
- [('dv', DictVectorizer()), ('est', LinearSVC())])
- estimator.fit(_extract_features(self.features),
- self.targets.values.ravel())
- fi = _calculate_feature_importances(estimator)
- self.assertEqual(sorted(self.exp_lsvm), sorted(fi))
-
- # confirm that feature selection incompatibility warnings work
- def test_disable_feature_selection_unsupported(self):
- with self.assertWarnsRegex(UserWarning, "does not support recursive"):
- _disable_feature_selection('KNeighborsClassifier', False)
-
- def test_mean_feature_importance_1d_arrays(self):
- exp = pd.DataFrame([10, 9, 8, 7], columns=["importance0"],
- index=[3, 2, 1, 0])
- imps = [pd.DataFrame([1, 2, 3, 4], columns=["importance0"]),
- pd.DataFrame([5, 6, 7, 8], columns=["importance0"]),
- pd.DataFrame([9, 10, 11, 12], columns=["importance0"]),
- pd.DataFrame([13, 14, 15, 16], columns=["importance0"])]
- pdt.assert_frame_equal(_mean_feature_importance(imps), exp)
-
- def test_mean_feature_importance_different_column_names(self):
- exp = pd.DataFrame([[6, 5, 4, 3], [14, 13, 12, 11]],
- index=["importance0", "importance1"],
- columns=[3, 2, 1, 0]).T
- imps = [pd.DataFrame([1, 2, 3, 4], columns=["importance0"]),
- pd.DataFrame([5, 6, 7, 8], columns=["importance0"]),
- pd.DataFrame([9, 10, 11, 12], columns=["importance1"]),
- pd.DataFrame([13, 14, 15, 16], columns=["importance1"])]
- pdt.assert_frame_equal(_mean_feature_importance(imps), exp)
-
- def test_mean_feature_importance_2d_arrays(self):
- exp = pd.DataFrame([[3.5] * 4, [9.5] * 4],
- index=["importance0", "importance1"],
- columns=[3, 2, 1, 0]).T
- imps = [pd.DataFrame([[6, 5, 4, 3], [14, 13, 12, 11]],
- index=["importance0", "importance1"],
- columns=[3, 2, 1, 0]).T,
- pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]],
- index=["importance0", "importance1"],
- columns=[3, 2, 1, 0]).T]
- pdt.assert_frame_equal(_mean_feature_importance(imps), exp)
+import qiime2
+from qiime2.plugins import sample_classifier
- # and this should not occur now, but theoretically should just concat and
- # sort but not collapse if all column names are unique
- def test_mean_feature_importance_do_not_collapse(self):
- imps = [pd.DataFrame([4, 3, 2, 1], columns=["importance0"]),
- pd.DataFrame([16, 15, 14, 13], columns=["importance1"])]
- exp = pd.concat(imps, axis=1)
- pdt.assert_frame_equal(_mean_feature_importance(imps), exp)
+from q2_sample_classifier.visuals import (
+ _custom_palettes, _roc_palette, _roc_per_class, _roc_micro_average,
+ _roc_macro_average, _binarize_labels, _generate_roc_plots)
+from q2_sample_classifier.utilities import _extract_rfe_scores
+from q2_sample_classifier.tests.test_base_class import \
+ SampleClassifierTestPluginBase
- def test_null_feature_importance(self):
- exp = pd.DataFrame(
- [1, 1, 1], index=['o1', 'o2', 'o3'], columns=['importance'])
- exp.index.name = 'feature'
- tab = biom.Table(np.array([[1., 2., 3.], [3., 2., 1.], [7., 6., 9.]]),
- ['o1', 'o2', 'o3'], ['s1', 's2', 's3'])
- tab = _extract_features(tab)
- pdt.assert_frame_equal(_null_feature_importance(tab), exp)
- def test_load_data(self):
- # phony feature table
- id_map = {'0': 'peanut', '1': 'bugs', '2': 'qiime2', '3': 'matt',
- '4': 'pandas'}
- a = self.features.update_ids(id_map, axis='sample')
- # phony metadata, convert to qiime2.Metadata
- b = self.targets
- b.index = ['pandas', 'peanut', 'qiime1', 'flapjacks', 'bugs']
- b.index.name = '#SampleID'
- b = qiime2.Metadata(b.to_frame())
- # test that merge of tables is inner merge
- intersection = set(('peanut', 'bugs', 'pandas'))
- feature_data, targets = _load_data(a, b, missing_samples='ignore')
- exp = [{'c': 1.0, 'a': 1.0, 'b': 1.0}, {'c': 1.0, 'a': 1.0, 'b': 1.0},
- {'c': 1.0, 'a': 1.0, 'b': 1.0}]
- np.testing.assert_array_equal(feature_data, exp)
- self.assertEqual(set(targets.index), intersection)
+filterwarnings("ignore", category=UserWarning)
+filterwarnings("ignore", category=Warning)
class TestRFEExtractor(SampleClassifierTestPluginBase):
@@ -222,17 +37,20 @@ class TestRFEExtractor(SampleClassifierTestPluginBase):
self.y = np.random.randint(0, 2, 50)
self.exp1 = pd.Series([
- 0.52, 0.61, 0.475, 0.49833333, 0.515, 0.51166667, 0.43166667,
- 0.50666667, 0.61666667, 0.50333333, 0.58166667, 0.495, 0.51166667,
- 0.465, 0.57833333, 0.57833333, 0.70166667, 0.45333333, 0.60666667,
- 0.44166667], index=pd.Index(range(1, 21)), name='Accuracy')
+ 0.4999999999999999, 0.52, 0.52, 0.5399999999999999,
+ 0.44000000000000006, 0.52, 0.4600000000000001,
+ 0.5599999999999998, 0.52, 0.52, 0.5, 0.5399999999999999, 0.54,
+ 0.5599999999999999, 0.47999999999999987, 0.6199999999999999,
+ 0.5399999999999999, 0.5, 0.4999999999999999, 0.45999999999999996],
+ index=pd.Index(range(1, 21)), name='Accuracy')
self.exp2 = pd.Series([
- 0.39166666666666666, 0.47833333333333333, 0.5766666666666667,
- 0.6066666666666667, 0.5366666666666667, 0.4, 0.5316666666666666,
- 0.4, 0.57, 0.4533333333333333, 0.4416666666666666],
+ 0.5000000000000001, 0.52, 0.48, 0.5599999999999998, 0.5,
+ 0.5799999999999998, 0.54, 0.4600000000000001, 0.6,
+ 0.45999999999999996, 0.45999999999999996],
index=pd.Index([1] + [i for i in range(2, 21, 2)]),
name='Accuracy')
- self.exp3 = pd.Series({1: 0.38666667, 20: 0.44166667}, name='Accuracy')
+ self.exp3 = pd.Series({1: 0.4600000000000001, 20: 0.45999999999999996},
+ name='Accuracy')
def extract_rfe_scores_template(self, steps, expected):
selector = RFECV(RandomForestClassifier(
@@ -261,771 +79,6 @@ class TestRFEExtractor(SampleClassifierTestPluginBase):
self.extract_rfe_scores_template(21, self.exp3)
-class VisualsTests(SampleClassifierTestPluginBase):
-
- def test_linear_regress(self):
- res = _linear_regress(md['Value'], md['Time'])
- self.assertAlmostEqual(res.iloc[0]['Mean squared error'], 1.9413916666)
- self.assertAlmostEqual(res.iloc[0]['r-value'], 0.86414956372460128)
- self.assertAlmostEqual(res.iloc[0]['r-squared'], 0.74675446848541871)
- self.assertAlmostEqual(res.iloc[0]['P-value'], 0.00028880275858705694)
-
- def test_calculate_baseline_accuracy(self):
- accuracy = 0.9
- y_test = pd.Series(['a', 'a', 'a', 'b', 'b', 'b'], name="class")
- classifier_accuracy = _calculate_baseline_accuracy(y_test, accuracy)
- expected_results = (6, 3, 0.5, 1.8)
- for i in zip(classifier_accuracy, expected_results):
- self.assertEqual(i[0], i[1])
-
-
-class TestSemanticTypes(SampleClassifierTestPluginBase):
-
- def test_boolean_series_format_validate_positive(self):
- filepath = self.get_data_path('outliers.tsv')
- format = BooleanSeriesFormat(filepath, mode='r')
- # These should both just succeed
- format.validate('min')
- format.validate('max')
-
- def test_boolean_series_format_validate_negative_col_count(self):
- filepath = self.get_data_path('coordinates.tsv')
- format = BooleanSeriesFormat(filepath, mode='r')
- with self.assertRaisesRegex(ValidationError, 'BooleanSeriesFormat'):
- format.validate()
-
- def test_boolean_series_format_validate_negative_cell_values(self):
- filepath = self.get_data_path('predictions.tsv')
- format = BooleanSeriesFormat(filepath, mode='r')
- with self.assertRaisesRegex(ValidationError, 'BooleanSeriesFormat'):
- format.validate()
-
- def test_boolean_series_format_validate_negative_empty(self):
- filepath = self.get_data_path('empty_file.txt')
- format = BooleanSeriesFormat(filepath, mode='r')
- with self.assertRaisesRegex(ValidationError, 'one data record'):
- format.validate()
-
- def test_boolean_series_dir_fmt_validate_positive(self):
- filepath = self.get_data_path('outliers.tsv')
- shutil.copy(filepath, self.temp_dir.name)
- format = BooleanSeriesDirectoryFormat(self.temp_dir.name, mode='r')
- format.validate()
-
- def test_boolean_series_semantic_type_registration(self):
- self.assertRegisteredSemanticType(BooleanSeries)
-
- def test_sample_data_boolean_series_to_boolean_dir_fmt_registration(self):
- self.assertSemanticTypeRegisteredToFormat(
- SampleData[BooleanSeries], BooleanSeriesDirectoryFormat)
-
- def test_pd_series_to_boolean_format(self):
- transformer = self.get_transformer(pd.Series, BooleanSeriesFormat)
- exp_index = pd.Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype=object)
- exp = pd.Series([True, False, True, False, True, False],
- name='outlier', index=exp_index)
- obs = transformer(exp)
- obs = pd.read_csv(str(obs), sep='\t', header=0, index_col=0,
- squeeze=True)
- self.assertEqual(sorted(exp), sorted(obs))
-
- def test_boolean_format_to_pd_series(self):
- _, obs = self.transform_format(
- BooleanSeriesFormat, pd.Series, 'outliers.tsv')
- exp_index = pd.Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype=object)
- exp = pd.Series(['True', 'False', 'True', 'False', 'True', 'False'],
- name='outlier', index=exp_index)
- self.assertEqual(sorted(exp), sorted(obs))
-
- def test_boolean_format_to_metadata(self):
- _, obs = self.transform_format(
- BooleanSeriesFormat, qiime2.Metadata, 'outliers.tsv')
-
- exp_index = pd.Index(['a', 'b', 'c', 'd', 'e', 'f'], name='id')
- exp = pd.DataFrame([['True'], ['False'], ['True'],
- ['False'], ['True'], ['False']],
- columns=['outlier'], index=exp_index, dtype='str')
- exp = qiime2.Metadata(exp)
- self.assertEqual(obs, exp)
-
- # test predictions format
- def test_Predictions_format_validate_positive_numeric_predictions(self):
- filepath = self.get_data_path('predictions.tsv')
- format = PredictionsFormat(filepath, mode='r')
- format.validate(level='min')
- format.validate()
-
- def test_Predictions_format_validate_positive_nonnumeric_predictions(self):
- filepath = self.get_data_path('categorical_predictions.tsv')
- format = PredictionsFormat(filepath, mode='r')
- format.validate(level='min')
- format.validate()
-
- def test_Predictions_format_validate_negative(self):
- filepath = self.get_data_path('coordinates.tsv')
- format = PredictionsFormat(filepath, mode='r')
- with self.assertRaisesRegex(ValidationError, 'PredictionsFormat'):
- format.validate()
-
- def test_Predictions_dir_fmt_validate_positive(self):
- filepath = self.get_data_path('predictions.tsv')
- shutil.copy(filepath, self.temp_dir.name)
- format = PredictionsDirectoryFormat(self.temp_dir.name, mode='r')
- format.validate()
-
- def test_RegressorPredictions_semantic_type_registration(self):
- self.assertRegisteredSemanticType(RegressorPredictions)
-
- def test_ClassifierPredictions_semantic_type_registration(self):
- self.assertRegisteredSemanticType(ClassifierPredictions)
-
- def test_RegressorPredictions_to_Predictions_dir_fmt_registration(self):
- self.assertSemanticTypeRegisteredToFormat(
- SampleData[RegressorPredictions], PredictionsDirectoryFormat)
-
- def test_ClassifierPredictions_to_Predictions_dir_fmt_registration(self):
- self.assertSemanticTypeRegisteredToFormat(
- SampleData[ClassifierPredictions], PredictionsDirectoryFormat)
-
- def test_pd_series_to_Predictions_format(self):
- transformer = self.get_transformer(pd.Series, PredictionsFormat)
- exp = pd.Series([1, 2, 3, 4],
- name='prediction', index=['a', 'b', 'c', 'd'])
- obs = transformer(exp)
- obs = pd.read_csv(str(obs), sep='\t', header=0, index_col=0,
- squeeze=True)
- pdt.assert_series_equal(obs, exp)
-
- def test_pd_series_to_Predictions_format_allow_nans(self):
- transformer = self.get_transformer(pd.Series, PredictionsFormat)
- exp = pd.Series([1, np.nan, 3, np.nan],
- name='prediction', index=['a', 'b', 'c', 'd'])
- obs = transformer(exp)
- obs = pd.read_csv(str(obs), sep='\t', header=0, index_col=0,
- squeeze=True)
- pdt.assert_series_equal(obs, exp)
-
- def test_Predictions_format_to_pd_series(self):
- _, obs = self.transform_format(
- PredictionsFormat, pd.Series, 'predictions.tsv')
- exp_index = pd.Index(['10249.C001.10SS', '10249.C002.05SS',
- '10249.C004.01SS', '10249.C004.11SS'],
- name='id', dtype=object)
- exp = pd.Series([4.5, 2.5, 0.5, 4.5], name='prediction',
- index=exp_index)
- pdt.assert_series_equal(obs[:4], exp)
-
- def test_Predictions_format_to_metadata(self):
- _, obs = self.transform_format(
- PredictionsFormat, qiime2.Metadata, 'predictions.tsv')
- exp_index = pd.Index(['10249.C001.10SS', '10249.C002.05SS',
- '10249.C004.01SS', '10249.C004.11SS'],
- name='id')
- exp = pd.DataFrame([4.5, 2.5, 0.5, 4.5], columns=['prediction'],
- index=exp_index)
- pdt.assert_frame_equal(obs.to_dataframe()[:4], exp)
-
- # test Importance format
- def test_Importance_format_validate_positive(self):
- filepath = self.get_data_path('importance.tsv')
- format = ImportanceFormat(filepath, mode='r')
- format.validate(level='min')
- format.validate()
-
- def test_Importance_format_validate_negative_nonnumeric(self):
- filepath = self.get_data_path('chardonnay.map.txt')
- format = ImportanceFormat(filepath, mode='r')
- with self.assertRaisesRegex(ValidationError, 'numeric values'):
- format.validate()
-
- def test_Importance_format_validate_negative_empty(self):
- filepath = self.get_data_path('empty_file.txt')
- format = ImportanceFormat(filepath, mode='r')
- with self.assertRaisesRegex(ValidationError, 'one data record'):
- format.validate()
-
- def test_Importance_format_validate_negative(self):
- filepath = self.get_data_path('garbage.txt')
- format = ImportanceFormat(filepath, mode='r')
- with self.assertRaisesRegex(ValidationError, 'two or more fields'):
- format.validate()
-
- def test_Importance_dir_fmt_validate_positive(self):
- filepath = self.get_data_path('importance.tsv')
- shutil.copy(filepath, self.temp_dir.name)
- format = ImportanceDirectoryFormat(self.temp_dir.name, mode='r')
- format.validate()
-
- def test_Importance_semantic_type_registration(self):
- self.assertRegisteredSemanticType(Importance)
-
- def test_sample_data_Importance_to_Importance_dir_fmt_registration(self):
- self.assertSemanticTypeRegisteredToFormat(
- FeatureData[Importance], ImportanceDirectoryFormat)
-
- def test_pd_dataframe_to_Importance_format(self):
- transformer = self.get_transformer(pd.DataFrame, ImportanceFormat)
- exp = pd.DataFrame([1, 2, 3, 4],
- columns=['importance'], index=['a', 'b', 'c', 'd'])
- obs = transformer(exp)
- obs = pd.read_csv(str(obs), sep='\t', header=0, index_col=0)
- pdt.assert_frame_equal(exp, obs)
-
- def test_Importance_format_to_pd_dataframe(self):
- _, obs = self.transform_format(
- ImportanceFormat, pd.DataFrame, 'importance.tsv')
- exp_index = pd.Index(['74ec9fe6ffab4ecff6d5def74298a825',
- 'c82032c40c98975f71892e4be561c87a',
- '79280cea51a6fe8a3432b2f266dd34db',
- 'f7686a74ca2d3729eb66305e8a26309b'],
- name='id')
- exp = pd.DataFrame([0.44469828320835586, 0.07760118417569697,
- 0.06570251750505914, 0.061718558716901406],
- columns=['importance'],
- index=exp_index)
- pdt.assert_frame_equal(exp, obs[:4])
-
- def test_Importance_format_to_metadata(self):
- _, obs = self.transform_format(
- ImportanceFormat, qiime2.Metadata, 'importance.tsv')
- exp_index = pd.Index(['74ec9fe6ffab4ecff6d5def74298a825',
- 'c82032c40c98975f71892e4be561c87a',
- '79280cea51a6fe8a3432b2f266dd34db',
- 'f7686a74ca2d3729eb66305e8a26309b'],
- name='id')
- exp = pd.DataFrame([0.44469828320835586, 0.07760118417569697,
- 0.06570251750505914, 0.061718558716901406],
- columns=['importance'],
- index=exp_index)
- pdt.assert_frame_equal(obs.to_dataframe()[:4], exp)
-
- # test Probabilities format
- def test_Probabilities_format_validate_positive(self):
- filepath = self.get_data_path('class_probabilities.tsv')
- format = ProbabilitiesFormat(filepath, mode='r')
- format.validate(level='min')
- format.validate()
-
- def test_Probabilities_format_validate_negative_nonnumeric(self):
- filepath = self.get_data_path('chardonnay.map.txt')
- format = ProbabilitiesFormat(filepath, mode='r')
- with self.assertRaisesRegex(ValidationError, 'numeric values'):
- format.validate()
-
- def test_Probabilities_format_validate_negative_empty(self):
- filepath = self.get_data_path('empty_file.txt')
- format = ProbabilitiesFormat(filepath, mode='r')
- with self.assertRaisesRegex(ValidationError, 'one data record'):
- format.validate()
-
- def test_Probabilities_format_validate_negative(self):
- filepath = self.get_data_path('garbage.txt')
- format = ProbabilitiesFormat(filepath, mode='r')
- with self.assertRaisesRegex(ValidationError, 'two or more fields'):
- format.validate()
-
- def test_Probabilities_dir_fmt_validate_positive(self):
- filepath = self.get_data_path('class_probabilities.tsv')
- shutil.copy(filepath, self.temp_dir.name)
- format = ProbabilitiesDirectoryFormat(self.temp_dir.name, mode='r')
- format.validate()
-
- def test_Probabilities_semantic_type_registration(self):
- self.assertRegisteredSemanticType(Probabilities)
-
- def test_sample_data_Probabilities_to_Probs_dir_fmt_registration(self):
- self.assertSemanticTypeRegisteredToFormat(
- SampleData[Probabilities], ProbabilitiesDirectoryFormat)
-
- def test_pd_dataframe_to_Probabilities_format(self):
- transformer = self.get_transformer(pd.DataFrame, ProbabilitiesFormat)
- exp = pd.DataFrame([[0.1, 0.77], [0.8, 0.4], [0.7, 0.1], [0.44, 0.73]],
- columns=['classA', 'classB'],
- index=['a', 'b', 'c', 'd'])
- obs = transformer(exp)
- obs = pd.read_csv(str(obs), sep='\t', header=0, index_col=0,
- parse_dates=True)
- pdt.assert_frame_equal(exp, obs)
-
- def test_Probabilities_format_to_pd_dataframe(self):
- _, obs = self.transform_format(
- ProbabilitiesFormat, pd.DataFrame, 'class_probabilities.tsv')
- exp_index = pd.Index(['s1', 's2', 's3', 's4', 's5', 's6', 's7'],
- name='id')
- exp = pd.DataFrame([[0.4446, 0.9828, 0.3208],
- [0.0776, 0.0118, 0.4175],
- [0.0657, 0.0251, 0.7505],
- [0.0617, 0.1855, 0.8716],
- [0.0281, 0.8616, 0.0291],
- [0.0261, 0.0253, 0.9075],
- [0.0252, 0.7385, 0.4068]],
- columns=['classA', 'classB', 'classC'],
- index=exp_index)
- pdt.assert_frame_equal(exp, obs)
-
- def test_Probabilities_format_to_metadata(self):
- _, obs = self.transform_format(
- ProbabilitiesFormat, qiime2.Metadata, 'class_probabilities.tsv')
- exp_index = pd.Index(['s1', 's2', 's3', 's4', 's5', 's6', 's7'],
- name='id')
- exp = pd.DataFrame([[0.4446, 0.9828, 0.3208],
- [0.0776, 0.0118, 0.4175],
- [0.0657, 0.0251, 0.7505],
- [0.0617, 0.1855, 0.8716],
- [0.0281, 0.8616, 0.0291],
- [0.0261, 0.0253, 0.9075],
- [0.0252, 0.7385, 0.4068]],
- columns=['classA', 'classB', 'classC'],
- index=exp_index)
- pdt.assert_frame_equal(obs.to_dataframe(), exp)
-
- # test utility formats
- def test_pickle_format_validate_negative(self):
- filepath = self.get_data_path('coordinates.tsv')
- format = PickleFormat(filepath, mode='r')
- with self.assertRaisesRegex(ValidationError, 'pickled file'):
- format.validate()
-
- def test_json_format_validate_negative(self):
- filepath = self.get_data_path('coordinates.tsv')
- format = JSONFormat(filepath, mode='r')
- with self.assertRaisesRegex(ValidationError, 'Expecting value'):
- format.validate()
-
- # this just checks that palette names are valid input
- def test_custom_palettes(self):
- confused = np.array([[1, 0], [0, 1]])
- for palette in _custom_palettes().keys():
- _plot_heatmap_from_confusion_matrix(confused, palette)
-
-
-class EstimatorsTests(SampleClassifierTestPluginBase):
-
- def setUp(self):
- super().setUp()
-
- def _load_biom(table_fp):
- table_fp = self.get_data_path(table_fp)
- table = qiime2.Artifact.load(table_fp)
- table = table.view(biom.Table)
- return table
-
- def _load_md(md_fp):
- md_fp = self.get_data_path(md_fp)
- md = pd.read_csv(md_fp, sep='\t', header=0, index_col=0)
- md = qiime2.Metadata(md)
- return md
-
- def _load_nmc(md_fp, column):
- md_fp = self.get_data_path(md_fp)
- md = pd.read_csv(md_fp, sep='\t', header=0, index_col=0)
- md = qiime2.NumericMetadataColumn(md[column])
- return md
-
- def _load_cmc(md_fp, column):
- md_fp = self.get_data_path(md_fp)
- md = pd.read_csv(md_fp, sep='\t', header=0, index_col=0)
- md = qiime2.CategoricalMetadataColumn(md[column])
- return md
-
- self.table_chard_fp = _load_biom('chardonnay.table.qza')
- self.md_chard_fp = _load_md('chardonnay.map.txt')
- self.mdc_chard_fp = _load_cmc('chardonnay.map.txt', 'Region')
- self.table_ecam_fp = _load_biom('ecam-table-maturity.qza')
- self.md_ecam_fp = _load_md('ecam_map_maturity.txt')
- self.mdc_ecam_fp = _load_nmc('ecam_map_maturity.txt', 'month')
- self.exp_imp = pd.read_csv(
- self.get_data_path('importance.tsv'), sep='\t', header=0,
- index_col=0)
- self.exp_pred = pd.read_csv(
- self.get_data_path('predictions.tsv'), sep='\t', header=0,
- index_col=0, squeeze=True)
- index = pd.Index(['A', 'B', 'C', 'D'], name='id')
- self.table_percnorm = qiime2.Artifact.import_data(
- FeatureTable[PercentileNormalized], pd.DataFrame(
- [[20.0, 20.0, 50.0, 10.0], [10.0, 10.0, 70.0, 10.0],
- [90.0, 8.0, 1.0, 1.0], [30.0, 15.0, 20.0, 35.0]],
- index=index,
- columns=['feat1', 'feat2', 'feat3', 'feat4'])).view(biom.Table)
- self.mdc_percnorm = qiime2.CategoricalMetadataColumn(
- pd.Series(['X', 'X', 'Y', 'Y'], index=index, name='name'))
-
- # test feature extraction
- def test_extract_features(self):
- table = self.table_ecam_fp
- dicts = _extract_features(table)
- dv = DictVectorizer()
- dv.fit(dicts)
- features = table.ids('observation')
- self.assertEqual(set(dv.get_feature_names()), set(features))
- self.assertEqual(len(dicts), len(table.ids()))
- for dict_row, (table_row, _, _) in zip(dicts, table.iter()):
- for feature, count in zip(features, table_row):
- if count == 0:
- self.assertTrue(feature not in dict_row)
- else:
- self.assertEqual(dict_row[feature], count)
-
- def test_classify_samples_from_dist(self):
- # -- setup -- #
- # 1,2 are a group, 3,4 are a group
- sample_ids = ('f1', 'f2', 's1', 's2')
- distance_matrix = skbio.DistanceMatrix([
- [0, 1, 4, 4],
- [1, 0, 4, 4],
- [4, 4, 0, 1],
- [4, 4, 1, 0],
- ], ids=sample_ids)
-
- dm = qiime2.Artifact.import_data('DistanceMatrix', distance_matrix)
- categories = pd.Series(('skinny', 'skinny', 'fat', 'fat'),
- index=sample_ids[::-1], name='body_mass')
- categories.index.name = 'SampleID'
- metadata = qiime2.CategoricalMetadataColumn(categories)
-
- # -- test -- #
- res = sample_classifier.actions.classify_samples_from_dist(
- distance_matrix=dm, metadata=metadata, k=1)
- pred = res[0].view(pd.Series).sort_values()
- expected = pd.Series(('fat', 'skinny', 'fat', 'skinny'),
- index=['f1', 's1', 'f2', 's2'])
- not_expected = pd.Series(('fat', 'fat', 'fat', 'skinny'),
- index=sample_ids)
-
- # order matters for pd.Series.equals()
- self.assertTrue(expected.sort_index().equals(pred.sort_index()))
- self.assertFalse(not_expected.sort_index().equals(pred.sort_index()))
-
- def test_classify_samples_from_dist_with_group_of_single_item(self):
- # -- setup -- #
- # 1 is a group, 2,3,4 are a group
- sample_ids = ('f1', 's1', 's2', 's3')
- distance_matrix = skbio.DistanceMatrix([
- [0, 2, 3, 3],
- [2, 0, 1, 1],
- [3, 1, 0, 1],
- [3, 1, 1, 0],
- ], ids=sample_ids)
-
- dm = qiime2.Artifact.import_data('DistanceMatrix', distance_matrix)
- categories = pd.Series(('fat', 'skinny', 'skinny', 'skinny'),
- index=sample_ids, name='body_mass')
- categories.index.name = 'SampleID'
- metadata = qiime2.CategoricalMetadataColumn(categories)
-
- # -- test -- #
- res = sample_classifier.actions.classify_samples_from_dist(
- distance_matrix=dm, metadata=metadata, k=1)
- pred = res[0].view(pd.Series)
- expected = pd.Series(('skinny', 'skinny', 'skinny', 'skinny'),
- index=sample_ids)
-
- self.assertTrue(expected.sort_index().equals(pred.sort_index()))
-
- def test_2nn(self):
- # -- setup -- #
- # 2 nearest neighbors of each sample are
- # f1: s1, s2 (classified as skinny)
- # s1: f1, s2 (closer to f1 so fat)
- # s2: f1, (s1 or s3) (closer to f1 so fat)
- # s3: s1, s2 (skinny)
- sample_ids = ('f1', 's1', 's2', 's3')
- distance_matrix = skbio.DistanceMatrix([
- [0, 2, 1, 5],
- [2, 0, 3, 4],
- [1, 3, 0, 3],
- [5, 4, 3, 0],
- ], ids=sample_ids)
-
- dm = qiime2.Artifact.import_data('DistanceMatrix', distance_matrix)
- categories = pd.Series(('fat', 'skinny', 'skinny', 'skinny'),
- index=sample_ids, name='body_mass')
- categories.index.name = 'SampleID'
- metadata = qiime2.CategoricalMetadataColumn(categories)
-
- # -- test -- #
- res = sample_classifier.actions.classify_samples_from_dist(
- distance_matrix=dm, metadata=metadata, k=2)
- pred = res[0].view(pd.Series)
- expected = pd.Series(('skinny', 'fat', 'fat', 'skinny'),
- index=sample_ids)
- self.assertTrue(expected.sort_index().equals(pred.sort_index()))
-
- # test that each classifier works and delivers an expected accuracy result
- # when a random seed is set.
- def test_classifiers(self):
- for classifier in ['RandomForestClassifier', 'ExtraTreesClassifier',
- 'GradientBoostingClassifier', 'AdaBoostClassifier',
- 'LinearSVC', 'SVC', 'KNeighborsClassifier']:
- table_fp = self.get_data_path('chardonnay.table.qza')
- table = qiime2.Artifact.load(table_fp)
- res = sample_classifier.actions.classify_samples(
- table=table, metadata=self.mdc_chard_fp,
- test_size=0.5, cv=1, n_estimators=10, n_jobs=1,
- estimator=classifier, random_state=123,
- parameter_tuning=False, optimize_feature_selection=False,
- missing_samples='ignore')
- pred = res[2].view(pd.Series)
- pred, truth = _match_series_or_die(
- pred, self.mdc_chard_fp.to_series(), 'ignore')
- accuracy = accuracy_score(truth, pred)
- self.assertAlmostEqual(
- accuracy, seeded_results[classifier], places=4,
- msg='Accuracy of %s classifier was %f, but expected %f' % (
- classifier, accuracy, seeded_results[classifier]))
-
- # test that the plugin methods/visualizers work
- def test_regress_samples_ncv(self):
- y_pred, importances = regress_samples_ncv(
- self.table_ecam_fp, self.mdc_ecam_fp, random_state=123,
- n_estimators=2, n_jobs=1, stratify=True, parameter_tuning=True,
- missing_samples='ignore')
-
- def test_classify_samples_ncv(self):
- y_pred, importances, probabilities = classify_samples_ncv(
- self.table_chard_fp, self.mdc_chard_fp, random_state=123,
- n_estimators=2, n_jobs=1, missing_samples='ignore')
-
- # test reproducibility of classifier results, probabilities
- def test_classify_samples_ncv_accuracy(self):
- dat = biom.Table(np.array(
- [[4446, 9828, 3208, 776, 118, 4175, 657, 251, 7505, 617],
- [1855, 8716, 3257, 1251, 3205, 2557, 4251, 7405, 1417, 1215],
- [6616, 281, 8616, 291, 261, 253, 9075, 252, 7385, 4068]]),
- observation_ids=['o1', 'o2', 'o3'],
- sample_ids=['s1', 's2', 's3', 's4', 's5',
- 's6', 's7', 's8', 's9', 's10'])
- md = qiime2.CategoricalMetadataColumn(pd.Series(
- ['red', 'red', 'red', 'red', 'red',
- 'blue', 'blue', 'blue', 'blue', 'blue'],
- index=pd.Index(['s1', 's2', 's3', 's4', 's5',
- 's6', 's7', 's8', 's9', 's10'],
- name='sample-id'), name='color'))
- y_pred, importances, probabilities = classify_samples_ncv(
- dat, md, random_state=123, n_estimators=2, n_jobs=1,
- missing_samples='ignore')
- exp_pred = pd.Series(
- ['blue', 'blue', 'blue', 'red', 'blue',
- 'blue', 'blue', 'red', 'red', 'blue'],
- index=pd.Index(['s1', 's7', 's5', 's9', 's3', 's10', 's4', 's6',
- 's2', 's8'], dtype='object', name='SampleID'),
- name='prediction')
- exp_importances = pd.DataFrame(
- [0.5551111111111111, 0.2671111111111111, 0.1777777777777778],
- index=pd.Index(['o3', 'o1', 'o2']), columns=['importance'])
- exp_probabilities = pd.DataFrame(
- [[1., 0.], [1., 0.], [1., 0.], [0., 1.], [0.5, 0.5], [0.5, 0.5],
- [0.5, 0.5], [0., 1.], [0., 1.], [0.5, 0.5]],
- index=pd.Index(['s1', 's7', 's5', 's9', 's3', 's10', 's4', 's6',
- 's2', 's8'], name='SampleID'),
- columns=['blue', 'red'])
- pdt.assert_series_equal(y_pred, exp_pred)
- pdt.assert_frame_equal(importances, exp_importances)
- pdt.assert_frame_equal(probabilities, exp_probabilities)
-
- # test ncv a second time with KNeighborsRegressor (no feature importance)
- def test_regress_samples_ncv_knn(self):
- y_pred, importances = regress_samples_ncv(
- self.table_ecam_fp, self.mdc_ecam_fp, random_state=123,
- n_estimators=2, n_jobs=1, stratify=False, parameter_tuning=False,
- estimator='KNeighborsRegressor', missing_samples='ignore')
-
- # test that ncv gives expected results
- def test_regress_samples_ncv_accuracy(self):
- y_pred, importances = regress_samples_ncv(
- self.table_ecam_fp, self.mdc_ecam_fp, random_state=123,
- n_estimators=2, n_jobs=1, missing_samples='ignore')
- pdt.assert_series_equal(y_pred, self.exp_pred)
- pdt.assert_frame_equal(importances, self.exp_imp)
-
- # test that fit_* methods output consistent importance scores
- def test_fit_regressor(self):
- pipeline, importances = fit_regressor(
- self.table_ecam_fp, self.mdc_ecam_fp, random_state=123,
- n_estimators=2, n_jobs=1, missing_samples='ignore')
- exp_imp = pd.read_csv(
- self.get_data_path('importance_cv.tsv'), sep='\t', header=0,
- index_col=0)
- pdt.assert_frame_equal(importances, exp_imp)
-
- # just make sure this method runs. Uses the same internal function as
- # fit_regressor, so importance score consistency is covered by the above
- # test.
- def test_fit_classifier(self):
- pipeline, importances = fit_classifier(
- self.table_ecam_fp, self.mdc_ecam_fp, random_state=123,
- n_estimators=2, n_jobs=1, optimize_feature_selection=True,
- parameter_tuning=True, missing_samples='ignore')
-
- # test that each regressor works and delivers an expected accuracy result
- # when a random seed is set.
- def test_regressors(self):
- for regressor in ['RandomForestRegressor', 'ExtraTreesRegressor',
- 'GradientBoostingRegressor', 'AdaBoostRegressor',
- 'Lasso', 'Ridge', 'ElasticNet',
- 'KNeighborsRegressor', 'LinearSVR', 'SVR']:
- table_fp = self.get_data_path('ecam-table-maturity.qza')
- table = qiime2.Artifact.load(table_fp)
- res = sample_classifier.actions.regress_samples(
- table=table, metadata=self.mdc_ecam_fp,
- test_size=0.5, cv=1, n_estimators=10, n_jobs=1,
- estimator=regressor, random_state=123,
- parameter_tuning=False, optimize_feature_selection=False,
- missing_samples='ignore', stratify=True)
- pred = res[2].view(pd.Series)
- pred, truth = _match_series_or_die(
- pred, self.mdc_ecam_fp.to_series(), 'ignore')
- accuracy = mean_squared_error(truth, pred)
- self.assertAlmostEqual(
- accuracy, seeded_results[regressor], places=4,
- msg='Accuracy of %s regressor was %f, but expected %f' % (
- regressor, accuracy, seeded_results[regressor]))
-
- # test adaboost base estimator trainer
- def test_train_adaboost_base_estimator(self):
- abe = _train_adaboost_base_estimator(
- self.table_chard_fp, self.mdc_chard_fp, 'Region',
- n_estimators=10, n_jobs=1, cv=3, random_state=None,
- parameter_tuning=True, classification=True,
- missing_samples='ignore')
- self.assertEqual(type(abe.named_steps.est), AdaBoostClassifier)
-
- # test some invalid inputs/edge cases
- def test_invalids(self):
- estimator, pad, pt = _set_parameters_and_estimator(
- 'RandomForestClassifier', self.table_chard_fp, self.md_chard_fp,
- 'Region', n_estimators=10, n_jobs=1, cv=1,
- random_state=123, parameter_tuning=False, classification=True,
- missing_samples='ignore')
- regressor, pad, pt = _set_parameters_and_estimator(
- 'RandomForestRegressor', self.table_chard_fp, self.md_chard_fp,
- 'Region', n_estimators=10, n_jobs=1, cv=1,
- random_state=123, parameter_tuning=False, classification=True,
- missing_samples='ignore')
-
- def test_split_table_no_rounding_error(self):
- X_train, X_test = split_table(
- self.table_chard_fp, self.mdc_chard_fp, test_size=0.5,
- random_state=123, stratify=True, missing_samples='ignore')
- self.assertEqual(len(X_train.ids()) + len(X_test.ids()), 21)
-
- def test_split_table_no_split(self):
- X_train, X_test = split_table(
- self.table_chard_fp, self.mdc_chard_fp, test_size=0.0,
- random_state=123, stratify=True, missing_samples='ignore')
- self.assertEqual(len(X_train.ids()), 21)
-
- def test_split_table_invalid_test_size(self):
- with self.assertRaisesRegex(ValueError, "at least two samples"):
- X_train, X_test = split_table(
- self.table_chard_fp, self.mdc_chard_fp, test_size=1.0,
- random_state=123, stratify=True, missing_samples='ignore')
-
- def test_split_table_percnorm(self):
- X_train, X_test = split_table(
- self.table_percnorm, self.mdc_percnorm, test_size=0.5,
- random_state=123, stratify=True, missing_samples='ignore')
- self.assertEqual(len(X_train.ids()) + len(X_test.ids()), 4)
-
- # test experimental functions
- def test_detect_outliers(self):
- detect_outliers(self.table_chard_fp, self.md_chard_fp,
- random_state=123, n_jobs=1, contamination=0.05)
-
- def test_detect_outliers_with_subsets(self):
- detect_outliers(self.table_chard_fp, self.md_chard_fp,
- random_state=123, n_jobs=1, contamination=0.05,
- subset_column='Vineyard', subset_value=1)
-
- def test_detect_outliers_raise_error_on_missing_subset_data(self):
- with self.assertRaisesRegex(ValueError, "must both be provided"):
- detect_outliers(self.table_chard_fp, self.md_chard_fp,
- random_state=123, n_jobs=1, contamination=0.05,
- subset_column='Vineyard', subset_value=None)
- with self.assertRaisesRegex(ValueError, "must both be provided"):
- detect_outliers(self.table_chard_fp, self.md_chard_fp,
- random_state=123, n_jobs=1, contamination=0.05,
- subset_column=None, subset_value=1)
-
- # just test that this works by making sure a classifier trained on samples
- # x, y, and z predicts the correct metadata values for those same samples.
- def test_predict_classifications(self):
- for classifier in ['RandomForestClassifier', 'ExtraTreesClassifier',
- 'GradientBoostingClassifier', 'AdaBoostClassifier',
- 'LinearSVC', 'SVC', 'KNeighborsClassifier']:
- estimator, importances = fit_classifier(
- self.table_chard_fp, self.mdc_chard_fp, random_state=123,
- n_estimators=2, estimator=classifier, n_jobs=1,
- missing_samples='ignore')
- pred, prob = predict_classification(self.table_chard_fp, estimator)
- exp = self.mdc_chard_fp.to_series().reindex(pred.index).dropna()
- # reindex both pred and exp because not all samples present in pred
- # are present in the metadata! (hence missing_samples='ignore')
- sample_ids = pred.index.intersection(exp.index)
- pred = pred.loc[sample_ids]
- exp = exp.loc[sample_ids]
- # test that expected number of correct results is achieved (these
- # are mostly quite high as we would expect (total n=21))
- correct_results = np.sum(pred == exp)
- self.assertEqual(
- correct_results, seeded_predict_results[classifier],
- msg='Accuracy of %s classifier was %f, but expected %f' % (
- classifier, correct_results,
- seeded_predict_results[classifier]))
-
- def test_predict_regressions(self):
- for regressor in ['RandomForestRegressor', 'ExtraTreesRegressor',
- 'GradientBoostingRegressor', 'AdaBoostRegressor',
- 'Lasso', 'Ridge', 'ElasticNet',
- 'KNeighborsRegressor', 'SVR', 'LinearSVR']:
- estimator, importances = fit_regressor(
- self.table_ecam_fp, self.mdc_ecam_fp, random_state=123,
- n_estimators=2, estimator=regressor, n_jobs=1,
- missing_samples='ignore')
- pred = predict_regression(self.table_ecam_fp, estimator)
- exp = self.mdc_ecam_fp.to_series()
- # reindex both pred and exp because not all samples present in pred
- # are present in the metadata! (hence missing_samples='ignore')
- sample_ids = pred.index.intersection(exp.index)
- pred = pred.loc[sample_ids]
- exp = exp.loc[sample_ids]
- # test that expected MSE is achieved (these are mostly quite high
- # as we would expect)
- mse = mean_squared_error(exp, pred)
- self.assertAlmostEqual(
- mse, seeded_predict_results[regressor],
- msg='Accuracy of %s regressor was %f, but expected %f' % (
- regressor, mse, seeded_predict_results[regressor]))
-
- # make sure predict still works when features are given in a different
- # order from training set.
- def test_predict_feature_order_aint_no_thing(self):
- table = self.table_ecam_fp
- estimator, importances = fit_regressor(
- table, self.mdc_ecam_fp, random_state=123, n_estimators=2,
- n_jobs=1, missing_samples='ignore')
-
- # randomly shuffle and reorder features in biom table.
- feature_ids = table.ids(axis='observation')
- # look ma no seed! we should get the same result no matter the order.
- np.random.shuffle(feature_ids)
- shuffled_table = table.sort_order(feature_ids, axis='observation')
-
- # now predict values on shuffled data
- pred = predict_regression(shuffled_table, estimator)
- exp = self.mdc_ecam_fp.to_series()
- # reindex both pred and exp because not all samples present in pred
- # are present in the metadata! (hence missing_samples='ignore')
- sample_ids = pred.index.intersection(exp.index)
- pred = pred.loc[sample_ids]
- exp = exp.loc[sample_ids]
- # test that expected MSE is achieved (these are mostly quite high
- # as we would expect)
- mse = mean_squared_error(exp, pred)
- self.assertAlmostEqual(
- mse, seeded_predict_results['RandomForestRegressor'])
-
-
# test classifier pipelines succeed on binary data
class TestBinaryClassification(SampleClassifierTestPluginBase):
@@ -1063,353 +116,6 @@ class TestBinaryClassification(SampleClassifierTestPluginBase):
pdt.assert_series_equal(exp, res[0].view(pd.Series))
-class TestHeatmap(SampleClassifierTestPluginBase):
-
- def setUp(self):
- super().setUp()
- md_ecam = self.get_data_path('ecam_map_maturity.txt')
- md_ecam = qiime2.Metadata.load(md_ecam)
- self.md_ecam = md_ecam.get_column('delivery')
- table_ecam = self.get_data_path('ecam-table-maturity.qza')
- table_ecam = qiime2.Artifact.load(table_ecam)
- self.table_ecam, = feature_table.actions.filter_samples(
- table_ecam, metadata=md_ecam)
- imp = pd.read_csv(
- self.get_data_path('importance.tsv'), sep='\t', header=0,
- index_col=0)
- self.imp = qiime2.Artifact.import_data('FeatureData[Importance]', imp)
-
- def test_heatmap_default_feature_count_zero(self):
- heatmap, table, = sample_classifier.actions.heatmap(
- self.table_ecam, self.imp, self.md_ecam, group_samples=True,
- feature_count=0)
- self.assertEqual(table.view(biom.Table).shape, (1056, 2))
-
- def test_heatmap_importance_threshold(self):
- heatmap, table, = sample_classifier.actions.heatmap(
- self.table_ecam, self.imp, self.md_ecam,
- importance_threshold=0.017, group_samples=False, feature_count=0)
- self.assertEqual(table.view(biom.Table).shape, (10, 121))
-
- def test_heatmap_feature_count(self):
- heatmap, table, = sample_classifier.actions.heatmap(
- self.table_ecam, self.imp, self.md_ecam, group_samples=True,
- feature_count=20)
- self.assertEqual(table.view(biom.Table).shape, (20, 2))
-
- def test_heatmap_must_group_or_die(self):
- with self.assertRaisesRegex(ValueError, "metadata are not optional"):
- heatmap, table, = sample_classifier.actions.heatmap(
- self.table_ecam, self.imp, sample_metadata=None,
- group_samples=True)
-
-
-class NowLetsTestTheActions(SampleClassifierTestPluginBase):
-
- def setUp(self):
- super().setUp()
- md = pd.Series(['a', 'a', 'b', 'b', 'b'],
- index=['a', 'b', 'c', 'd', 'e'], name='bugs')
- md.index.name = 'SampleID'
- self.md = qiime2.CategoricalMetadataColumn(md)
-
- tab = biom.Table(
- np.array([[3, 6, 7, 3, 6], [3, 4, 5, 6, 2], [8, 6, 4, 1, 0],
- [8, 6, 4, 1, 0], [8, 6, 4, 1, 0]]),
- observation_ids=['v', 'w', 'x', 'y', 'z'],
- sample_ids=['a', 'b', 'c', 'd', 'e'])
- self.tab = qiime2.Artifact.import_data('FeatureTable[Frequency]', tab)
-
- md2 = pd.DataFrame({'trash': ['a', 'a', 'b', 'b', 'b', 'junk'],
- 'floats': [0.1, 0.1, 1.3, 1.8, 1000.1, 0.1],
- 'ints': [0, 1, 2, 2, 2, 0],
- 'nans': [1, 1, 2, 2, np.nan, np.nan],
- 'negatives': [-7, -3, -1.2, -4, -9, -1]},
- index=['a', 'b', 'c', 'd', 'e', 'peanut'])
- md2.index.name = 'SampleID'
- self.md2 = qiime2.Metadata(md2)
-
- # let's make sure the correct transformers are in place! See issue 114
- # if this runs without error, that's good enough for me. We already
- # validate the function above.
- def test_action_split_table(self):
- sample_classifier.actions.split_table(self.tab, self.md, test_size=0.5)
-
- def test_metatable(self):
- exp = biom.Table(
- np.array([[0.1, 0.1, 1.3, 1.8, 1000.1, 0.1],
- [0, 1, 2, 2, 2, 0]]),
- observation_ids=['floats', 'ints'],
- sample_ids=['a', 'b', 'c', 'd', 'e', 'peanut'])
- res, = sample_classifier.actions.metatable(
- self.md2, missing_values='drop_features')
- report = res.view(biom.Table).descriptive_equality(exp)
- self.assertIn('Tables appear equal', report, report)
-
- def test_metatable_missing_error(self):
- with self.assertRaisesRegex(ValueError, "missing values"):
- sample_classifier.actions.metatable(
- self.md2, missing_values='error')
-
- def test_metatable_drop_samples(self):
- exp = biom.Table(
- np.array([[3, 6, 7, 3], [3, 4, 5, 6], [8, 6, 4, 1],
- [8, 6, 4, 1], [8, 6, 4, 1],
- [0.1, 0.1, 1.3, 1.8],
- [0, 1, 2, 2], [1, 1, 2, 2]]),
- observation_ids=['v', 'w', 'x', 'y', 'z', 'floats', 'ints',
- 'nans'],
- sample_ids=['a', 'b', 'c', 'd'])
- res, = sample_classifier.actions.metatable(
- self.md2, self.tab, missing_values='drop_samples')
- report = res.view(biom.Table).descriptive_equality(exp)
- self.assertIn('Tables appear equal', report, report)
-
- def test_metatable_fill_na(self):
- exp = biom.Table(
- np.array([[3, 6, 7, 3, 6], [3, 4, 5, 6, 2], [8, 6, 4, 1, 0],
- [8, 6, 4, 1, 0], [8, 6, 4, 1, 0],
- [0.1, 0.1, 1.3, 1.8, 1000.1],
- [0, 1, 2, 2, 2], [1., 1., 2., 2., 0.]]),
- observation_ids=['v', 'w', 'x', 'y', 'z', 'floats', 'ints',
- 'nans'],
- sample_ids=['a', 'b', 'c', 'd', 'e'])
- res, = sample_classifier.actions.metatable(
- self.md2, self.tab, missing_values='fill')
- report = res.view(biom.Table).descriptive_equality(exp)
- self.assertIn('Tables appear equal', report, report)
-
- def test_metatable_with_merge(self):
- exp = biom.Table(
- np.array([[3, 6, 7, 3, 6], [3, 4, 5, 6, 2], [8, 6, 4, 1, 0],
- [8, 6, 4, 1, 0], [8, 6, 4, 1, 0],
- [0.1, 0.1, 1.3, 1.8, 1000.1],
- [0, 1, 2, 2, 2]]),
- observation_ids=['v', 'w', 'x', 'y', 'z', 'floats', 'ints'],
- sample_ids=['a', 'b', 'c', 'd', 'e'])
- res, = sample_classifier.actions.metatable(
- self.md2, self.tab, missing_values='drop_features')
- report = res.view(biom.Table).descriptive_equality(exp)
- self.assertIn('Tables appear equal', report, report)
-
- def test_metatable_with_merge_successful_inner_join(self):
- exp = biom.Table(
- np.array([[3, 6, 7, 3], [3, 4, 5, 6], [8, 6, 4, 1],
- [8, 6, 4, 1], [8, 6, 4, 1], [0.1, 0.1, 1.3, 1.8],
- [0, 1, 2, 2], [1., 1., 2., 2.]]),
- observation_ids=['v', 'w', 'x', 'y', 'z', 'floats', 'ints',
- 'nans'],
- sample_ids=['a', 'b', 'c', 'd'])
- res, = sample_classifier.actions.metatable(
- self.md2.filter_ids(['a', 'b', 'c', 'd']), self.tab,
- missing_values='error')
- report = res.view(biom.Table).descriptive_equality(exp)
- self.assertIn('Tables appear equal', report, report)
-
- def test_metatable_with_merge_error_inner_join(self):
- with self.assertRaisesRegex(ValueError, "Missing samples"):
- sample_classifier.actions.metatable(
- self.md2.filter_ids(['a', 'b', 'c', 'd']),
- self.tab, missing_samples='error',
- missing_values='drop_samples')
-
- def test_metatable_empty_metadata_after_drop_all_unique(self):
- with self.assertRaisesRegex(
- ValueError, "All metadata"): # are belong to us
- sample_classifier.actions.metatable(
- self.md2.filter_ids(['b', 'c']), self.tab,
- missing_values='drop_samples', drop_all_unique=True)
-
- def test_metatable_no_samples_after_filtering(self):
- junk_md = pd.DataFrame(
- {'trash': ['a', 'a', 'b', 'b', 'b', 'junk'],
- 'floats': [np.nan, np.nan, np.nan, 1.8, 1000.1, 0.1],
- 'ints': [0, 1, 2, np.nan, 2, 0],
- 'nans': [1, 1, 2, 2, np.nan, np.nan],
- 'negatives': [-7, -4, -1.2, -4, -9, -1]},
- index=['a', 'b', 'c', 'd', 'e', 'peanut'])
- junk_md.index.name = 'SampleID'
- junk_md = qiime2.Metadata(junk_md)
- with self.assertRaisesRegex(ValueError, "All metadata samples"):
- sample_classifier.actions.metatable(
- junk_md, missing_values='drop_samples')
-
-
-class SampleEstimatorTestBase(SampleClassifierTestPluginBase):
- package = 'q2_sample_classifier.tests'
-
- def setUp(self):
- super().setUp()
-
- def _load_biom(table_fp):
- table_fp = self.get_data_path(table_fp)
- table = qiime2.Artifact.load(table_fp)
- table = table.view(biom.Table)
- return table
-
- def _load_cmc(md_fp, column):
- md_fp = self.get_data_path(md_fp)
- md = pd.read_csv(md_fp, sep='\t', header=0, index_col=0)
- md = qiime2.CategoricalMetadataColumn(md[column])
- return md
-
- table_chard_fp = _load_biom('chardonnay.table.qza')
- mdc_chard_fp = _load_cmc('chardonnay.map.txt', 'Region')
-
- pipeline, importances = fit_classifier(
- table_chard_fp, mdc_chard_fp, random_state=123,
- n_estimators=2, n_jobs=1, optimize_feature_selection=True,
- parameter_tuning=True, missing_samples='ignore')
- transformer = self.get_transformer(
- Pipeline, SampleEstimatorDirFmt)
- self._sklp = transformer(pipeline)
- sklearn_pipeline = self._sklp.sklearn_pipeline.view(PickleFormat)
- self.sklearn_pipeline = str(sklearn_pipeline)
- self.pipeline = pipeline
-
- def _custom_setup(self, version):
- with open(os.path.join(self.temp_dir.name,
- 'sklearn_version.json'), 'w') as fh:
- fh.write(json.dumps({'sklearn-version': version}))
- shutil.copy(self.sklearn_pipeline, self.temp_dir.name)
- return SampleEstimatorDirFmt(
- self.temp_dir.name, mode='r')
-
-
-# This class really just checks that these visualizers run without error. Yay.
-# Also tests some internal nuts/bolts, but there's not much else we can do.
-class TestPlottingVisualizers(SampleClassifierTestPluginBase):
- def setUp(self):
- super().setUp()
- self.tmpd = join(self.temp_dir.name, 'viz')
- mkdir(self.tmpd)
-
- self.a = pd.Series(['a', 'a', 'b', 'b', 'c', 'c'], name='site',
- index=['a1', 'a2', 'b1', 'b2', 'c1', 'c2'])
- self.a.index.name = 'SampleID'
- self.bogus = pd.Series(['a', 'a', 'b', 'b', 'c', 'c'], name='site',
- index=['a1', 'e3', 'f5', 'b2', 'z1', 'c2'])
- self.bogus.index.name = 'SampleID'
- self.c = pd.Series(
- [0, 1, 2, 3], index=['a', 'b', 'c', 'd'], name='peanuts')
- self.c.index.name = 'SampleID'
-
- def test_confusion_matrix(self):
- b = qiime2.CategoricalMetadataColumn(self.a)
- confusion_matrix(self.tmpd, self.a, b)
-
- def test_confusion_matrix_class_overlap_error(self):
- b = pd.Series([1, 2, 3, 4, 5, 6], name='site',
- index=['a1', 'a2', 'b1', 'b2', 'c1', 'c2'])
- b.index.name = 'id'
- b = qiime2.NumericMetadataColumn(b)
- with self.assertRaisesRegex(ValueError, "do not overlap"):
- confusion_matrix(self.tmpd, self.a, b)
-
- def test_confusion_matrix_vmin_too_high(self):
- b = qiime2.CategoricalMetadataColumn(self.a)
- with self.assertRaisesRegex(ValueError, r'vmin must be less than.*\s\s'
- r'0\.5.*greater.*0\.0'):
- confusion_matrix(self.tmpd, self.a, b, vmin=.5, vmax=None)
-
- def test_confusion_matrix_vmax_too_low(self):
- b = qiime2.CategoricalMetadataColumn(self.a)
- with self.assertRaisesRegex(ValueError, r'vmax must be greater than.*'
- r'\s\s0\.5.*less.*1\.0'):
- confusion_matrix(self.tmpd, self.a, b, vmin=None, vmax=.5)
-
- def test_confusion_matrix_vmin_too_high_and_vmax_too_low(self):
- b = qiime2.CategoricalMetadataColumn(self.a)
- with self.assertRaisesRegex(ValueError, r'vmin must be less than.*\s'
- r'\s0\.5.*greater.*0\.0\s.*vmax must be '
- r'greater than.*\s\s0\.5.*less.*1\.0'):
- confusion_matrix(self.tmpd, self.a, b, vmin=.5, vmax=.5)
-
- # test confusion matrix plotting independently to see how it handles
- # partially overlapping classes when true labels are superset
- def test_predict_and_plot_true_labels_are_superset(self):
- b = pd.Series(['a', 'a', 'b', 'b', 'b', 'b'], name='site',
- index=['a1', 'a2', 'b1', 'b2', 'c1', 'c2'])
- exp = pd.DataFrame(
- [[1., 0., 0., ''],
- [0., 1., 0., ''],
- [0., 1., 0., ''],
- ['', '', '', 0.666666666],
- ['', '', '', 0.3333333333],
- ['', '', '', 2.]],
- columns=['a', 'b', 'c', 'Overall Accuracy'],
- index=['a', 'b', 'c', 'Overall Accuracy', 'Baseline Accuracy',
- 'Accuracy Ratio'])
- predictions, confusion = _predict_and_plot(self.tmpd, self.a, b)
- pdt.assert_frame_equal(exp, predictions)
-
- # test confusion matrix plotting independently to see how it handles
-    # partially overlapping classes when true labels are subset
- def test_predict_and_plot_true_labels_are_subset(self):
- b = pd.Series(['a', 'a', 'b', 'b', 'c', 'd'], name='site',
- index=['a1', 'a2', 'b1', 'b2', 'c1', 'c2'])
- exp = pd.DataFrame(
- [[1., 0., 0., 0., ''],
- [0., 1., 0., 0., ''],
- [0., 0., 0.5, 0.5, ''],
- [0., 0., 0., 0., ''],
- ['', '', '', '', 0.8333333333],
- ['', '', '', '', 0.3333333333],
- ['', '', '', '', 2.5]],
- columns=['a', 'b', 'c', 'd', 'Overall Accuracy'],
- index=['a', 'b', 'c', 'd', 'Overall Accuracy', 'Baseline Accuracy',
- 'Accuracy Ratio'])
- predictions, confusion = _predict_and_plot(self.tmpd, self.a, b)
- pdt.assert_frame_equal(exp, predictions)
-
- # test confusion matrix plotting independently to see how it handles
- # partially overlapping classes when true labels are mutually exclusive
- def test_predict_and_plot_true_labels_are_mutually_exclusive(self):
- b = pd.Series(['a', 'a', 'e', 'e', 'd', 'd'], name='site',
- index=['a1', 'a2', 'b1', 'b2', 'c1', 'c2'])
- exp = pd.DataFrame(
- [[1., 0., 0., 0., 0., ''],
- [0., 0., 0., 0., 1., ''],
- [0., 0., 0., 1., 0., ''],
- [0., 0., 0., 0., 0., ''],
- [0., 0., 0., 0., 0., ''],
- ['', '', '', '', '', 0.3333333333],
- ['', '', '', '', '', 0.3333333333],
- ['', '', '', '', '', 1.]],
- columns=['a', 'b', 'c', 'd', 'e', 'Overall Accuracy'],
- index=['a', 'b', 'c', 'd', 'e', 'Overall Accuracy',
- 'Baseline Accuracy', 'Accuracy Ratio'])
- predictions, confusion = _predict_and_plot(self.tmpd, self.a, b)
- pdt.assert_frame_equal(exp, predictions)
-
- def test_scatterplot(self):
- b = qiime2.NumericMetadataColumn(self.c)
- scatterplot(self.tmpd, self.c, b)
-
- def test_add_sample_size_to_xtick_labels(self):
- labels = _add_sample_size_to_xtick_labels(self.a, ['a', 'b', 'c'])
- exp = ['a (n=2)', 'b (n=2)', 'c (n=2)']
- self.assertListEqual(labels, exp)
-
- # now test performance when extra classes are present
- def test_add_sample_size_to_xtick_labels_extra_classes(self):
- labels = _add_sample_size_to_xtick_labels(
- self.a, [0, 'a', 'b', 'bb', 'c'])
- exp = ['0 (n=0)', 'a (n=2)', 'b (n=2)', 'bb (n=0)', 'c (n=2)']
- self.assertListEqual(labels, exp)
-
- def test_match_series_or_die(self):
- exp = pd.Series(['a', 'b', 'c'], name='site', index=['a1', 'b2', 'c2'])
- exp.index.name = 'SampleID'
- a, b = _match_series_or_die(self.a, self.bogus, 'ignore')
- pdt.assert_series_equal(exp, a)
- pdt.assert_series_equal(exp, b)
-
- def test_match_series_or_die_missing_samples(self):
- with self.assertRaisesRegex(ValueError, "Missing samples"):
- a, b = _match_series_or_die(self.a, self.bogus, 'error')
-
-
class TestROC(SampleClassifierTestPluginBase):
def setUp(self):
super().setUp()
@@ -1511,127 +217,3 @@ class TestBinarize(SampleClassifierTestPluginBase):
exp = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1],
[1, 0, 0], [0, 1, 0], [0, 0, 1]])
np.testing.assert_array_equal(exp, labels)
-
-
-class TestTypes(SampleClassifierTestPluginBase):
- def test_sample_estimator_semantic_type_registration(self):
- self.assertRegisteredSemanticType(SampleEstimator)
-
- def test_classifier_semantic_type_registration(self):
- self.assertRegisteredSemanticType(Classifier)
-
- def test_regressor_semantic_type_registration(self):
- self.assertRegisteredSemanticType(Regressor)
-
- def test_sample_classifier_semantic_type_to_format_registration(self):
- self.assertSemanticTypeRegisteredToFormat(
- SampleEstimator[Classifier], SampleEstimatorDirFmt)
-
- def test_sample_regressor_semantic_type_to_format_registration(self):
- self.assertSemanticTypeRegisteredToFormat(
- SampleEstimator[Regressor], SampleEstimatorDirFmt)
-
-
-class TestFormats(SampleEstimatorTestBase):
- def test_sample_classifier_dir_fmt(self):
- format = self._custom_setup(sklearn.__version__)
-
- # Should not error
- format.validate()
-
-
-class TestTransformers(SampleEstimatorTestBase):
- def test_old_sklearn_version(self):
- transformer = self.get_transformer(
- SampleEstimatorDirFmt, Pipeline)
- input = self._custom_setup('a very old version')
- with self.assertRaises(ValueError):
- transformer(input)
-
- def test_taxo_class_dir_fmt_to_taxo_class_result(self):
- input = self._custom_setup(sklearn.__version__)
-
- transformer = self.get_transformer(
- SampleEstimatorDirFmt, Pipeline)
- obs = transformer(input)
-
- self.assertTrue(obs)
-
- def test_taxo_class_result_to_taxo_class_dir_fmt(self):
- def read_pipeline(pipeline_filepath):
- with tarfile.open(pipeline_filepath) as tar:
- dirname = tempfile.mkdtemp()
- tar.extractall(dirname)
- pipeline = joblib.load(os.path.join(dirname,
- 'sklearn_pipeline.pkl'))
- for fn in tar.getnames():
- os.unlink(os.path.join(dirname, fn))
- os.rmdir(dirname)
- return pipeline
-
- exp = read_pipeline(self.sklearn_pipeline)
- transformer = self.get_transformer(
- Pipeline, SampleEstimatorDirFmt)
- obs = transformer(exp)
- sklearn_pipeline = obs.sklearn_pipeline.view(PickleFormat)
- obs_pipeline = read_pipeline(str(sklearn_pipeline))
- obs = obs_pipeline
- self.assertTrue(obs)
-
-
-# make sure summarize visualizer works and that rfe_scores are stored properly
-class TestSummarize(SampleEstimatorTestBase):
-
- def test_summary_with_rfecv(self):
- summarize(self.temp_dir.name, self.pipeline)
-
- def test_summary_without_rfecv(self):
- del self.pipeline.rfe_scores
- summarize(self.temp_dir.name, self.pipeline)
-
-
-md = pd.DataFrame([(1, 'a', 0.11), (1, 'a', 0.12), (1, 'a', 0.13),
- (2, 'a', 0.19), (2, 'a', 0.18), (2, 'a', 0.21),
- (1, 'b', 0.14), (1, 'b', 0.13), (1, 'b', 0.14),
- (2, 'b', 0.26), (2, 'b', 0.27), (2, 'b', 0.29)],
- columns=['Time', 'Group', 'Value'])
-
-tab1 = pd.DataFrame([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], columns=['Junk'])
-
-seeded_results = {
- 'RandomForestClassifier': 0.63636363636363635,
- 'ExtraTreesClassifier': 0.454545454545,
- 'GradientBoostingClassifier': 0.272727272727,
- 'AdaBoostClassifier': 0.272727272727,
- 'LinearSVC': 0.727272727273,
- 'SVC': 0.36363636363636365,
- 'KNeighborsClassifier': 0.363636363636,
- 'RandomForestRegressor': 23.226508,
- 'ExtraTreesRegressor': 19.725397,
- 'GradientBoostingRegressor': 34.157100,
- 'AdaBoostRegressor': 30.920635,
- 'Lasso': 722.827623,
- 'Ridge': 123.625210,
- 'ElasticNet': 618.532273,
- 'KNeighborsRegressor': 44.7847619048,
- 'LinearSVR': 511.816385601,
- 'SVR': 51.325146}
-
-seeded_predict_results = {
- 'RandomForestClassifier': 18,
- 'ExtraTreesClassifier': 21,
- 'GradientBoostingClassifier': 21,
- 'AdaBoostClassifier': 21,
- 'LinearSVC': 21,
- 'SVC': 12,
- 'KNeighborsClassifier': 14,
- 'RandomForestRegressor': 7.4246031746,
- 'ExtraTreesRegressor': 0.,
- 'GradientBoostingRegressor': 50.1955883469,
- 'AdaBoostRegressor': 9.7857142857142865,
- 'Lasso': 0.173138653701,
- 'Ridge': 7.57617215386,
- 'ElasticNet': 0.0614243397637,
- 'KNeighborsRegressor': 26.8625396825,
- 'SVR': 37.86704865859832,
- 'LinearSVR': 0.0099912565770459132}
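
The seeded_results and seeded_predict_results tables above pin the metric
values produced under a fixed random_state, so behavioural drift in a
dependency surfaces as a test failure (the same tables reappear below in the
new test_estimators.py, with updated Ridge values). A minimal, self-contained
sketch of that pattern; the data, seed, and expected-value handling here are
invented for illustration and are not taken from the suite:

    import numpy as np
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score

    # a fixed seed makes both the synthetic data and the fit reproducible
    rng = np.random.RandomState(123)
    X = rng.rand(40, 5)
    y = rng.randint(0, 2, 40)

    clf = RandomForestClassifier(n_estimators=10, random_state=123).fit(X, y)
    acc = accuracy_score(y, clf.predict(X))

    # a seeded test stores this value once and asserts it on every later run,
    # e.g. self.assertAlmostEqual(acc, expected_acc, places=4)
    print(round(acc, 4))

Pinning exact floats is deliberately brittle; when a dependency changes its
internals the stored constants must be re-derived, which is presumably why
the Ridge values differ between the old and new tables.
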
=====================================
q2_sample_classifier/tests/test_estimators.py
=====================================
@@ -0,0 +1,560 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2017-2020, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+import os
+import pandas as pd
+import pandas.util.testing as pdt
+import biom
+import shutil
+import json
+import numpy as np
+from sklearn.metrics import mean_squared_error, accuracy_score
+from sklearn.ensemble import AdaBoostClassifier
+from sklearn.feature_extraction import DictVectorizer
+from sklearn.pipeline import Pipeline
+import skbio
+
+import qiime2
+from q2_types.feature_table import (FeatureTable, PercentileNormalized)
+
+from qiime2.plugins import sample_classifier
+from q2_sample_classifier.tests.test_base_class import \
+ SampleClassifierTestPluginBase
+from q2_sample_classifier.classify import (
+ regress_samples_ncv, classify_samples_ncv, fit_classifier, fit_regressor,
+ detect_outliers, split_table, predict_classification,
+ predict_regression)
+from q2_sample_classifier.utilities import (
+ _set_parameters_and_estimator, _train_adaboost_base_estimator,
+ _match_series_or_die, _extract_features)
+from q2_sample_classifier import (
+ SampleEstimatorDirFmt, PickleFormat)
+
+
+class SampleEstimatorTestBase(SampleClassifierTestPluginBase):
+ package = 'q2_sample_classifier.tests'
+
+ def setUp(self):
+ super().setUp()
+
+ def _load_biom(table_fp):
+ table_fp = self.get_data_path(table_fp)
+ table = qiime2.Artifact.load(table_fp)
+ table = table.view(biom.Table)
+ return table
+
+ def _load_cmc(md_fp, column):
+ md_fp = self.get_data_path(md_fp)
+ md = pd.read_csv(md_fp, sep='\t', header=0, index_col=0)
+ md = qiime2.CategoricalMetadataColumn(md[column])
+ return md
+
+ table_chard_fp = _load_biom('chardonnay.table.qza')
+ mdc_chard_fp = _load_cmc('chardonnay.map.txt', 'Region')
+
+ pipeline, importances = fit_classifier(
+ table_chard_fp, mdc_chard_fp, random_state=123,
+ n_estimators=2, n_jobs=1, optimize_feature_selection=True,
+ parameter_tuning=True, missing_samples='ignore')
+ transformer = self.get_transformer(
+ Pipeline, SampleEstimatorDirFmt)
+ self._sklp = transformer(pipeline)
+ sklearn_pipeline = self._sklp.sklearn_pipeline.view(PickleFormat)
+ self.sklearn_pipeline = str(sklearn_pipeline)
+ self.pipeline = pipeline
+
+ def _custom_setup(self, version):
+ with open(os.path.join(self.temp_dir.name,
+ 'sklearn_version.json'), 'w') as fh:
+ fh.write(json.dumps({'sklearn-version': version}))
+ shutil.copy(self.sklearn_pipeline, self.temp_dir.name)
+ return SampleEstimatorDirFmt(
+ self.temp_dir.name, mode='r')
+
+
+class EstimatorsTests(SampleClassifierTestPluginBase):
+
+ def setUp(self):
+ super().setUp()
+
+ def _load_biom(table_fp):
+ table_fp = self.get_data_path(table_fp)
+ table = qiime2.Artifact.load(table_fp)
+ table = table.view(biom.Table)
+ return table
+
+ def _load_md(md_fp):
+ md_fp = self.get_data_path(md_fp)
+ md = pd.read_csv(md_fp, sep='\t', header=0, index_col=0)
+ md = qiime2.Metadata(md)
+ return md
+
+ def _load_nmc(md_fp, column):
+ md_fp = self.get_data_path(md_fp)
+ md = pd.read_csv(md_fp, sep='\t', header=0, index_col=0)
+ md = qiime2.NumericMetadataColumn(md[column])
+ return md
+
+ def _load_cmc(md_fp, column):
+ md_fp = self.get_data_path(md_fp)
+ md = pd.read_csv(md_fp, sep='\t', header=0, index_col=0)
+ md = qiime2.CategoricalMetadataColumn(md[column])
+ return md
+
+ self.table_chard_fp = _load_biom('chardonnay.table.qza')
+ self.md_chard_fp = _load_md('chardonnay.map.txt')
+ self.mdc_chard_fp = _load_cmc('chardonnay.map.txt', 'Region')
+ self.table_ecam_fp = _load_biom('ecam-table-maturity.qza')
+ self.md_ecam_fp = _load_md('ecam_map_maturity.txt')
+ self.mdc_ecam_fp = _load_nmc('ecam_map_maturity.txt', 'month')
+ self.exp_imp = pd.read_csv(
+ self.get_data_path('importance.tsv'), sep='\t', header=0,
+ index_col=0)
+ self.exp_pred = pd.read_csv(
+ self.get_data_path('predictions.tsv'), sep='\t', header=0,
+ index_col=0, squeeze=True)
+ index = pd.Index(['A', 'B', 'C', 'D'], name='id')
+ self.table_percnorm = qiime2.Artifact.import_data(
+ FeatureTable[PercentileNormalized], pd.DataFrame(
+ [[20.0, 20.0, 50.0, 10.0], [10.0, 10.0, 70.0, 10.0],
+ [90.0, 8.0, 1.0, 1.0], [30.0, 15.0, 20.0, 35.0]],
+ index=index,
+ columns=['feat1', 'feat2', 'feat3', 'feat4'])).view(biom.Table)
+ self.mdc_percnorm = qiime2.CategoricalMetadataColumn(
+ pd.Series(['X', 'X', 'Y', 'Y'], index=index, name='name'))
+
+ # test feature extraction
+ def test_extract_features(self):
+ table = self.table_ecam_fp
+ dicts = _extract_features(table)
+ dv = DictVectorizer()
+ dv.fit(dicts)
+ features = table.ids('observation')
+ self.assertEqual(set(dv.get_feature_names()), set(features))
+ self.assertEqual(len(dicts), len(table.ids()))
+ for dict_row, (table_row, _, _) in zip(dicts, table.iter()):
+ for feature, count in zip(features, table_row):
+ if count == 0:
+ self.assertTrue(feature not in dict_row)
+ else:
+ self.assertEqual(dict_row[feature], count)
+
+ def test_classify_samples_from_dist(self):
+ # -- setup -- #
+ # 1,2 are a group, 3,4 are a group
+ sample_ids = ('f1', 'f2', 's1', 's2')
+ distance_matrix = skbio.DistanceMatrix([
+ [0, 1, 4, 4],
+ [1, 0, 4, 4],
+ [4, 4, 0, 1],
+ [4, 4, 1, 0],
+ ], ids=sample_ids)
+
+ dm = qiime2.Artifact.import_data('DistanceMatrix', distance_matrix)
+ categories = pd.Series(('skinny', 'skinny', 'fat', 'fat'),
+ index=sample_ids[::-1], name='body_mass')
+ categories.index.name = 'SampleID'
+ metadata = qiime2.CategoricalMetadataColumn(categories)
+
+ # -- test -- #
+ res = sample_classifier.actions.classify_samples_from_dist(
+ distance_matrix=dm, metadata=metadata, k=1)
+ pred = res[0].view(pd.Series).sort_values()
+ expected = pd.Series(('fat', 'skinny', 'fat', 'skinny'),
+ index=['f1', 's1', 'f2', 's2'])
+ not_expected = pd.Series(('fat', 'fat', 'fat', 'skinny'),
+ index=sample_ids)
+
+ # order matters for pd.Series.equals()
+ self.assertTrue(expected.sort_index().equals(pred.sort_index()))
+ self.assertFalse(not_expected.sort_index().equals(pred.sort_index()))
+
+ def test_classify_samples_from_dist_with_group_of_single_item(self):
+ # -- setup -- #
+ # 1 is a group, 2,3,4 are a group
+ sample_ids = ('f1', 's1', 's2', 's3')
+ distance_matrix = skbio.DistanceMatrix([
+ [0, 2, 3, 3],
+ [2, 0, 1, 1],
+ [3, 1, 0, 1],
+ [3, 1, 1, 0],
+ ], ids=sample_ids)
+
+ dm = qiime2.Artifact.import_data('DistanceMatrix', distance_matrix)
+ categories = pd.Series(('fat', 'skinny', 'skinny', 'skinny'),
+ index=sample_ids, name='body_mass')
+ categories.index.name = 'SampleID'
+ metadata = qiime2.CategoricalMetadataColumn(categories)
+
+ # -- test -- #
+ res = sample_classifier.actions.classify_samples_from_dist(
+ distance_matrix=dm, metadata=metadata, k=1)
+ pred = res[0].view(pd.Series)
+ expected = pd.Series(('skinny', 'skinny', 'skinny', 'skinny'),
+ index=sample_ids)
+
+ self.assertTrue(expected.sort_index().equals(pred.sort_index()))
+
+ def test_2nn(self):
+ # -- setup -- #
+ # 2 nearest neighbors of each sample are
+ # f1: s1, s2 (classified as skinny)
+ # s1: f1, s2 (closer to f1 so fat)
+ # s2: f1, (s1 or s3) (closer to f1 so fat)
+ # s3: s1, s2 (skinny)
+ sample_ids = ('f1', 's1', 's2', 's3')
+ distance_matrix = skbio.DistanceMatrix([
+ [0, 2, 1, 5],
+ [2, 0, 3, 4],
+ [1, 3, 0, 3],
+ [5, 4, 3, 0],
+ ], ids=sample_ids)
+
+ dm = qiime2.Artifact.import_data('DistanceMatrix', distance_matrix)
+ categories = pd.Series(('fat', 'skinny', 'skinny', 'skinny'),
+ index=sample_ids, name='body_mass')
+ categories.index.name = 'SampleID'
+ metadata = qiime2.CategoricalMetadataColumn(categories)
+
+ # -- test -- #
+ res = sample_classifier.actions.classify_samples_from_dist(
+ distance_matrix=dm, metadata=metadata, k=2)
+ pred = res[0].view(pd.Series)
+ expected = pd.Series(('skinny', 'fat', 'fat', 'skinny'),
+ index=sample_ids)
+ self.assertTrue(expected.sort_index().equals(pred.sort_index()))
+
+ # test that each classifier works and delivers an expected accuracy result
+ # when a random seed is set.
+ def test_classifiers(self):
+ for classifier in ['RandomForestClassifier', 'ExtraTreesClassifier',
+ 'GradientBoostingClassifier', 'AdaBoostClassifier',
+ 'LinearSVC', 'SVC', 'KNeighborsClassifier']:
+ table_fp = self.get_data_path('chardonnay.table.qza')
+ table = qiime2.Artifact.load(table_fp)
+ res = sample_classifier.actions.classify_samples(
+ table=table, metadata=self.mdc_chard_fp,
+ test_size=0.5, cv=1, n_estimators=10, n_jobs=1,
+ estimator=classifier, random_state=123,
+ parameter_tuning=False, optimize_feature_selection=False,
+ missing_samples='ignore')
+ pred = res[2].view(pd.Series)
+ pred, truth = _match_series_or_die(
+ pred, self.mdc_chard_fp.to_series(), 'ignore')
+ accuracy = accuracy_score(truth, pred)
+ self.assertAlmostEqual(
+ accuracy, seeded_results[classifier], places=4,
+ msg='Accuracy of %s classifier was %f, but expected %f' % (
+ classifier, accuracy, seeded_results[classifier]))
+
+ # test that the plugin methods/visualizers work
+ def test_regress_samples_ncv(self):
+ y_pred, importances = regress_samples_ncv(
+ self.table_ecam_fp, self.mdc_ecam_fp, random_state=123,
+ n_estimators=2, n_jobs=1, stratify=True, parameter_tuning=True,
+ missing_samples='ignore')
+
+ def test_classify_samples_ncv(self):
+ y_pred, importances, probabilities = classify_samples_ncv(
+ self.table_chard_fp, self.mdc_chard_fp, random_state=123,
+ n_estimators=2, n_jobs=1, missing_samples='ignore')
+
+ # test reproducibility of classifier results, probabilities
+ def test_classify_samples_ncv_accuracy(self):
+ dat = biom.Table(np.array(
+ [[4446, 9828, 3208, 776, 118, 4175, 657, 251, 7505, 617],
+ [1855, 8716, 3257, 1251, 3205, 2557, 4251, 7405, 1417, 1215],
+ [6616, 281, 8616, 291, 261, 253, 9075, 252, 7385, 4068]]),
+ observation_ids=['o1', 'o2', 'o3'],
+ sample_ids=['s1', 's2', 's3', 's4', 's5',
+ 's6', 's7', 's8', 's9', 's10'])
+ md = qiime2.CategoricalMetadataColumn(pd.Series(
+ ['red', 'red', 'red', 'red', 'red',
+ 'blue', 'blue', 'blue', 'blue', 'blue'],
+ index=pd.Index(['s1', 's2', 's3', 's4', 's5',
+ 's6', 's7', 's8', 's9', 's10'],
+ name='sample-id'), name='color'))
+ y_pred, importances, probabilities = classify_samples_ncv(
+ dat, md, random_state=123, n_estimators=2, n_jobs=1,
+ missing_samples='ignore')
+ exp_pred = pd.Series(
+ ['blue', 'red', 'red', 'blue', 'blue',
+ 'blue', 'blue', 'red', 'blue', 'blue'],
+ index=pd.Index(['s4', 's6', 's1', 's10', 's5', 's8', 's2', 's9',
+ 's3', 's7'], dtype='object', name='SampleID'),
+ name='prediction')
+ exp_importances = pd.DataFrame(
+ [0.595111111111111, 0.23155555555555551, 0.17333333333333334],
+ index=pd.Index(['o3', 'o1', 'o2']), columns=['importance'])
+ exp_probabilities = pd.DataFrame(
+ [[0.5, 0.5], [0., 1.], [0., 1.], [0.5, 0.5], [0.5, 0.5],
+ [0.5, 0.5], [0.5, 0.5], [0., 1.], [1., 0.], [1., 0.]],
+ index=pd.Index(['s4', 's6', 's1', 's10', 's5', 's8', 's2', 's9',
+ 's3', 's7'], name='SampleID'),
+ columns=['blue', 'red'])
+ pdt.assert_series_equal(y_pred, exp_pred)
+ pdt.assert_frame_equal(importances, exp_importances)
+ pdt.assert_frame_equal(probabilities, exp_probabilities)
+
+ # test ncv a second time with KNeighborsRegressor (no feature importance)
+ def test_regress_samples_ncv_knn(self):
+ y_pred, importances = regress_samples_ncv(
+ self.table_ecam_fp, self.mdc_ecam_fp, random_state=123,
+ n_estimators=2, n_jobs=1, stratify=False, parameter_tuning=False,
+ estimator='KNeighborsRegressor', missing_samples='ignore')
+
+ # test that ncv gives expected results
+ def test_regress_samples_ncv_accuracy(self):
+ y_pred, importances = regress_samples_ncv(
+ self.table_ecam_fp, self.mdc_ecam_fp, random_state=123,
+ n_estimators=2, n_jobs=1, missing_samples='ignore')
+ pdt.assert_series_equal(y_pred, self.exp_pred)
+ pdt.assert_frame_equal(importances, self.exp_imp)
+
+ # test that fit_* methods output consistent importance scores
+ def test_fit_regressor(self):
+ pipeline, importances = fit_regressor(
+ self.table_ecam_fp, self.mdc_ecam_fp, random_state=123,
+ n_estimators=2, n_jobs=1, missing_samples='ignore')
+ exp_imp = pd.read_csv(
+ self.get_data_path('importance_cv.tsv'), sep='\t', header=0,
+ index_col=0)
+ pdt.assert_frame_equal(importances, exp_imp)
+
+ # just make sure this method runs. Uses the same internal function as
+ # fit_regressor, so importance score consistency is covered by the above
+ # test.
+ def test_fit_classifier(self):
+ pipeline, importances = fit_classifier(
+ self.table_ecam_fp, self.mdc_ecam_fp, random_state=123,
+ n_estimators=2, n_jobs=1, optimize_feature_selection=True,
+ parameter_tuning=True, missing_samples='ignore')
+
+ # test that each regressor works and delivers an expected accuracy result
+ # when a random seed is set.
+ def test_regressors(self):
+ for regressor in ['RandomForestRegressor', 'ExtraTreesRegressor',
+ 'GradientBoostingRegressor', 'AdaBoostRegressor',
+ 'Lasso', 'Ridge', 'ElasticNet',
+ 'KNeighborsRegressor', 'LinearSVR', 'SVR']:
+ table_fp = self.get_data_path('ecam-table-maturity.qza')
+ table = qiime2.Artifact.load(table_fp)
+ res = sample_classifier.actions.regress_samples(
+ table=table, metadata=self.mdc_ecam_fp,
+ test_size=0.5, cv=1, n_estimators=10, n_jobs=1,
+ estimator=regressor, random_state=123,
+ parameter_tuning=False, optimize_feature_selection=False,
+ missing_samples='ignore', stratify=True)
+ pred = res[2].view(pd.Series)
+ pred, truth = _match_series_or_die(
+ pred, self.mdc_ecam_fp.to_series(), 'ignore')
+            mse = mean_squared_error(truth, pred)
+            # TODO: Remove this conditional when
+            # https://github.com/qiime2/q2-sample-classifier/issues/193 is
+            # closed
+            if regressor == 'Ridge':
+                self.assertAlmostEqual(
+                    mse, seeded_results[regressor], places=0,
+                    msg='MSE of %s regressor was %f, but expected %f' % (
+                        regressor, mse, seeded_results[regressor]))
+            else:
+                self.assertAlmostEqual(
+                    mse, seeded_results[regressor], places=4,
+                    msg='MSE of %s regressor was %f, but expected %f' % (
+                        regressor, mse, seeded_results[regressor]))
+
+ # test adaboost base estimator trainer
+ def test_train_adaboost_base_estimator(self):
+ abe = _train_adaboost_base_estimator(
+ self.table_chard_fp, self.mdc_chard_fp, 'Region',
+ n_estimators=10, n_jobs=1, cv=3, random_state=None,
+ parameter_tuning=True, classification=True,
+ missing_samples='ignore')
+ self.assertEqual(type(abe.named_steps.est), AdaBoostClassifier)
+
+ # test some invalid inputs/edge cases
+ def test_invalids(self):
+ estimator, pad, pt = _set_parameters_and_estimator(
+ 'RandomForestClassifier', self.table_chard_fp, self.md_chard_fp,
+ 'Region', n_estimators=10, n_jobs=1, cv=1,
+ random_state=123, parameter_tuning=False, classification=True,
+ missing_samples='ignore')
+ regressor, pad, pt = _set_parameters_and_estimator(
+ 'RandomForestRegressor', self.table_chard_fp, self.md_chard_fp,
+ 'Region', n_estimators=10, n_jobs=1, cv=1,
+ random_state=123, parameter_tuning=False, classification=True,
+ missing_samples='ignore')
+
+ def test_split_table_no_rounding_error(self):
+ X_train, X_test = split_table(
+ self.table_chard_fp, self.mdc_chard_fp, test_size=0.5,
+ random_state=123, stratify=True, missing_samples='ignore')
+ self.assertEqual(len(X_train.ids()) + len(X_test.ids()), 21)
+
+ def test_split_table_no_split(self):
+ X_train, X_test = split_table(
+ self.table_chard_fp, self.mdc_chard_fp, test_size=0.0,
+ random_state=123, stratify=True, missing_samples='ignore')
+ self.assertEqual(len(X_train.ids()), 21)
+
+ def test_split_table_invalid_test_size(self):
+ with self.assertRaisesRegex(ValueError, "at least two samples"):
+ X_train, X_test = split_table(
+ self.table_chard_fp, self.mdc_chard_fp, test_size=1.0,
+ random_state=123, stratify=True, missing_samples='ignore')
+
+ def test_split_table_percnorm(self):
+ X_train, X_test = split_table(
+ self.table_percnorm, self.mdc_percnorm, test_size=0.5,
+ random_state=123, stratify=True, missing_samples='ignore')
+ self.assertEqual(len(X_train.ids()) + len(X_test.ids()), 4)
+
+ # test experimental functions
+ def test_detect_outliers(self):
+ detect_outliers(self.table_chard_fp, self.md_chard_fp,
+ random_state=123, n_jobs=1, contamination=0.05)
+
+ def test_detect_outliers_with_subsets(self):
+ detect_outliers(self.table_chard_fp, self.md_chard_fp,
+ random_state=123, n_jobs=1, contamination=0.05,
+ subset_column='Vineyard', subset_value=1)
+
+ def test_detect_outliers_raise_error_on_missing_subset_data(self):
+ with self.assertRaisesRegex(ValueError, "must both be provided"):
+ detect_outliers(self.table_chard_fp, self.md_chard_fp,
+ random_state=123, n_jobs=1, contamination=0.05,
+ subset_column='Vineyard', subset_value=None)
+ with self.assertRaisesRegex(ValueError, "must both be provided"):
+ detect_outliers(self.table_chard_fp, self.md_chard_fp,
+ random_state=123, n_jobs=1, contamination=0.05,
+ subset_column=None, subset_value=1)
+
+ # just test that this works by making sure a classifier trained on samples
+ # x, y, and z predicts the correct metadata values for those same samples.
+ def test_predict_classifications(self):
+ for classifier in ['RandomForestClassifier', 'ExtraTreesClassifier',
+ 'GradientBoostingClassifier', 'AdaBoostClassifier',
+ 'LinearSVC', 'SVC', 'KNeighborsClassifier']:
+ estimator, importances = fit_classifier(
+ self.table_chard_fp, self.mdc_chard_fp, random_state=123,
+ n_estimators=2, estimator=classifier, n_jobs=1,
+ missing_samples='ignore')
+ pred, prob = predict_classification(self.table_chard_fp, estimator)
+ exp = self.mdc_chard_fp.to_series().reindex(pred.index).dropna()
+ # reindex both pred and exp because not all samples present in pred
+ # are present in the metadata! (hence missing_samples='ignore')
+ sample_ids = pred.index.intersection(exp.index)
+ pred = pred.loc[sample_ids]
+ exp = exp.loc[sample_ids]
+ # test that expected number of correct results is achieved (these
+ # are mostly quite high as we would expect (total n=21))
+ correct_results = np.sum(pred == exp)
+ self.assertEqual(
+ correct_results, seeded_predict_results[classifier],
+            msg='Correct prediction count for %s was %f, but expected %f' % (
+ classifier, correct_results,
+ seeded_predict_results[classifier]))
+
+ def test_predict_regressions(self):
+ for regressor in ['RandomForestRegressor', 'ExtraTreesRegressor',
+ 'GradientBoostingRegressor', 'AdaBoostRegressor',
+ 'Lasso', 'Ridge', 'ElasticNet',
+ 'KNeighborsRegressor', 'SVR', 'LinearSVR']:
+ estimator, importances = fit_regressor(
+ self.table_ecam_fp, self.mdc_ecam_fp, random_state=123,
+ n_estimators=2, estimator=regressor, n_jobs=1,
+ missing_samples='ignore')
+ pred = predict_regression(self.table_ecam_fp, estimator)
+ exp = self.mdc_ecam_fp.to_series()
+ # reindex both pred and exp because not all samples present in pred
+ # are present in the metadata! (hence missing_samples='ignore')
+ sample_ids = pred.index.intersection(exp.index)
+ pred = pred.loc[sample_ids]
+ exp = exp.loc[sample_ids]
+ # test that expected MSE is achieved (these are mostly quite high
+ # as we would expect)
+ mse = mean_squared_error(exp, pred)
+ # TODO: Remove this conditional when
+ # https://github.com/qiime2/q2-sample-classifier/issues/193 is
+ # closed
+ if regressor == 'Ridge':
+ self.assertAlmostEqual(
+ mse, seeded_predict_results[regressor], places=4,
+                    msg='MSE of %s regressor was %f, but expected %f' % (
+ regressor, mse, seeded_predict_results[regressor]))
+ else:
+ self.assertAlmostEqual(
+ mse, seeded_predict_results[regressor],
+                    msg='MSE of %s regressor was %f, but expected %f' % (
+ regressor, mse, seeded_predict_results[regressor]))
+
+ # make sure predict still works when features are given in a different
+ # order from training set.
+ def test_predict_feature_order_aint_no_thing(self):
+ table = self.table_ecam_fp
+ estimator, importances = fit_regressor(
+ table, self.mdc_ecam_fp, random_state=123, n_estimators=2,
+ n_jobs=1, missing_samples='ignore')
+
+ # randomly shuffle and reorder features in biom table.
+ feature_ids = table.ids(axis='observation')
+ # look ma no seed! we should get the same result no matter the order.
+ np.random.shuffle(feature_ids)
+ shuffled_table = table.sort_order(feature_ids, axis='observation')
+
+ # now predict values on shuffled data
+ pred = predict_regression(shuffled_table, estimator)
+ exp = self.mdc_ecam_fp.to_series()
+ # reindex both pred and exp because not all samples present in pred
+ # are present in the metadata! (hence missing_samples='ignore')
+ sample_ids = pred.index.intersection(exp.index)
+ pred = pred.loc[sample_ids]
+ exp = exp.loc[sample_ids]
+ # test that expected MSE is achieved (these are mostly quite high
+ # as we would expect)
+ mse = mean_squared_error(exp, pred)
+ self.assertAlmostEqual(
+ mse, seeded_predict_results['RandomForestRegressor'])
+
+
+seeded_results = {
+ 'RandomForestClassifier': 0.63636363636363635,
+ 'ExtraTreesClassifier': 0.454545454545,
+ 'GradientBoostingClassifier': 0.272727272727,
+ 'AdaBoostClassifier': 0.272727272727,
+ 'LinearSVC': 0.727272727273,
+ 'SVC': 0.36363636363636365,
+ 'KNeighborsClassifier': 0.363636363636,
+ 'RandomForestRegressor': 23.226508,
+ 'ExtraTreesRegressor': 19.725397,
+ 'GradientBoostingRegressor': 34.157100,
+ 'AdaBoostRegressor': 30.920635,
+ 'Lasso': 722.827623,
+ 'Ridge': 521.195194222418,
+ 'ElasticNet': 618.532273,
+ 'KNeighborsRegressor': 44.7847619048,
+ 'LinearSVR': 511.816385601,
+ 'SVR': 51.325146}
+
+seeded_predict_results = {
+ 'RandomForestClassifier': 18,
+ 'ExtraTreesClassifier': 21,
+ 'GradientBoostingClassifier': 21,
+ 'AdaBoostClassifier': 21,
+ 'LinearSVC': 21,
+ 'SVC': 12,
+ 'KNeighborsClassifier': 14,
+ 'RandomForestRegressor': 7.4246031746,
+ 'ExtraTreesRegressor': 0.,
+ 'GradientBoostingRegressor': 50.1955883469,
+ 'AdaBoostRegressor': 9.7857142857142865,
+ 'Lasso': 0.173138653701,
+ 'Ridge': 2.694020055323081e-05,
+ 'ElasticNet': 0.0614243397637,
+ 'KNeighborsRegressor': 26.8625396825,
+ 'SVR': 37.86704865859832,
+ 'LinearSVR': 0.0099912565770459132}
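
test_extract_features above asserts the conversion contract used before model
fitting: each sample becomes a {feature: count} dict, zero counts are
omitted, and a DictVectorizer rebuilds the full matrix. A small sketch of
that contract with invented sample dicts; the real conversion is
_extract_features in q2_sample_classifier.utilities:

    from sklearn.feature_extraction import DictVectorizer

    # one dict per sample, zero counts left out (as the test asserts)
    samples = [
        {'o1': 4446.0, 'o2': 1855.0, 'o3': 6616.0},
        {'o1': 9828.0, 'o3': 281.0},  # 'o2' had count 0, so it is omitted
    ]

    dv = DictVectorizer(sparse=False)
    X = dv.fit_transform(samples)

    print(dv.get_feature_names())  # ['o1', 'o2', 'o3']
    print(X)  # omitted keys come back as 0.0 columns

Omitting zeros keeps the per-sample dicts sparse, and DictVectorizer restores
missing keys as 0.0, so the round trip loses no information.
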
=====================================
q2_sample_classifier/tests/test_types_formats_transformers.py
=====================================
@@ -0,0 +1,423 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2017-2020, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+import os
+import pandas as pd
+import pandas.util.testing as pdt
+import numpy as np
+import shutil
+import tempfile
+import tarfile
+import joblib
+import sklearn
+from sklearn.pipeline import Pipeline
+
+
+import qiime2
+from q2_types.feature_data import FeatureData
+from qiime2.plugin import ValidationError
+from q2_types.sample_data import SampleData
+
+from q2_sample_classifier import (
+ BooleanSeriesFormat, BooleanSeriesDirectoryFormat, BooleanSeries,
+ PredictionsFormat, PredictionsDirectoryFormat, ClassifierPredictions,
+ RegressorPredictions, ImportanceFormat, ImportanceDirectoryFormat,
+ Importance, PickleFormat, ProbabilitiesFormat,
+ ProbabilitiesDirectoryFormat, Probabilities, Classifier, Regressor,
+ SampleEstimator, SampleEstimatorDirFmt)
+from q2_sample_classifier.visuals import (
+ _custom_palettes, _plot_heatmap_from_confusion_matrix,)
+from q2_sample_classifier._format import JSONFormat
+from q2_sample_classifier.tests.test_base_class import \
+ SampleClassifierTestPluginBase
+from q2_sample_classifier.tests.test_estimators import SampleEstimatorTestBase
+
+
+class TestSemanticTypes(SampleClassifierTestPluginBase):
+
+ def test_boolean_series_format_validate_positive(self):
+ filepath = self.get_data_path('outliers.tsv')
+ format = BooleanSeriesFormat(filepath, mode='r')
+ # These should both just succeed
+ format.validate('min')
+ format.validate('max')
+
+ def test_boolean_series_format_validate_negative_col_count(self):
+ filepath = self.get_data_path('coordinates.tsv')
+ format = BooleanSeriesFormat(filepath, mode='r')
+ with self.assertRaisesRegex(ValidationError, 'BooleanSeriesFormat'):
+ format.validate()
+
+ def test_boolean_series_format_validate_negative_cell_values(self):
+ filepath = self.get_data_path('predictions.tsv')
+ format = BooleanSeriesFormat(filepath, mode='r')
+ with self.assertRaisesRegex(ValidationError, 'BooleanSeriesFormat'):
+ format.validate()
+
+ def test_boolean_series_format_validate_negative_empty(self):
+ filepath = self.get_data_path('empty_file.txt')
+ format = BooleanSeriesFormat(filepath, mode='r')
+ with self.assertRaisesRegex(ValidationError, 'one data record'):
+ format.validate()
+
+ def test_boolean_series_dir_fmt_validate_positive(self):
+ filepath = self.get_data_path('outliers.tsv')
+ shutil.copy(filepath, self.temp_dir.name)
+ format = BooleanSeriesDirectoryFormat(self.temp_dir.name, mode='r')
+ format.validate()
+
+ def test_boolean_series_semantic_type_registration(self):
+ self.assertRegisteredSemanticType(BooleanSeries)
+
+ def test_sample_data_boolean_series_to_boolean_dir_fmt_registration(self):
+ self.assertSemanticTypeRegisteredToFormat(
+ SampleData[BooleanSeries], BooleanSeriesDirectoryFormat)
+
+ def test_pd_series_to_boolean_format(self):
+ transformer = self.get_transformer(pd.Series, BooleanSeriesFormat)
+ exp_index = pd.Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype=object)
+ exp = pd.Series([True, False, True, False, True, False],
+ name='outlier', index=exp_index)
+ obs = transformer(exp)
+ obs = pd.read_csv(str(obs), sep='\t', header=0, index_col=0,
+ squeeze=True)
+ self.assertEqual(sorted(exp), sorted(obs))
+
+ def test_boolean_format_to_pd_series(self):
+ _, obs = self.transform_format(
+ BooleanSeriesFormat, pd.Series, 'outliers.tsv')
+ exp_index = pd.Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype=object)
+ exp = pd.Series(['True', 'False', 'True', 'False', 'True', 'False'],
+ name='outlier', index=exp_index)
+ self.assertEqual(sorted(exp), sorted(obs))
+
+ def test_boolean_format_to_metadata(self):
+ _, obs = self.transform_format(
+ BooleanSeriesFormat, qiime2.Metadata, 'outliers.tsv')
+
+ exp_index = pd.Index(['a', 'b', 'c', 'd', 'e', 'f'], name='id')
+ exp = pd.DataFrame([['True'], ['False'], ['True'],
+ ['False'], ['True'], ['False']],
+ columns=['outlier'], index=exp_index, dtype='str')
+ exp = qiime2.Metadata(exp)
+ self.assertEqual(obs, exp)
+
+ # test predictions format
+ def test_Predictions_format_validate_positive_numeric_predictions(self):
+ filepath = self.get_data_path('predictions.tsv')
+ format = PredictionsFormat(filepath, mode='r')
+ format.validate(level='min')
+ format.validate()
+
+ def test_Predictions_format_validate_positive_nonnumeric_predictions(self):
+ filepath = self.get_data_path('categorical_predictions.tsv')
+ format = PredictionsFormat(filepath, mode='r')
+ format.validate(level='min')
+ format.validate()
+
+ def test_Predictions_format_validate_negative(self):
+ filepath = self.get_data_path('coordinates.tsv')
+ format = PredictionsFormat(filepath, mode='r')
+ with self.assertRaisesRegex(ValidationError, 'PredictionsFormat'):
+ format.validate()
+
+ def test_Predictions_dir_fmt_validate_positive(self):
+ filepath = self.get_data_path('predictions.tsv')
+ shutil.copy(filepath, self.temp_dir.name)
+ format = PredictionsDirectoryFormat(self.temp_dir.name, mode='r')
+ format.validate()
+
+ def test_RegressorPredictions_semantic_type_registration(self):
+ self.assertRegisteredSemanticType(RegressorPredictions)
+
+ def test_ClassifierPredictions_semantic_type_registration(self):
+ self.assertRegisteredSemanticType(ClassifierPredictions)
+
+ def test_RegressorPredictions_to_Predictions_dir_fmt_registration(self):
+ self.assertSemanticTypeRegisteredToFormat(
+ SampleData[RegressorPredictions], PredictionsDirectoryFormat)
+
+ def test_ClassifierPredictions_to_Predictions_dir_fmt_registration(self):
+ self.assertSemanticTypeRegisteredToFormat(
+ SampleData[ClassifierPredictions], PredictionsDirectoryFormat)
+
+ def test_pd_series_to_Predictions_format(self):
+ transformer = self.get_transformer(pd.Series, PredictionsFormat)
+ exp = pd.Series([1, 2, 3, 4],
+ name='prediction', index=['a', 'b', 'c', 'd'])
+ obs = transformer(exp)
+ obs = pd.read_csv(str(obs), sep='\t', header=0, index_col=0,
+ squeeze=True)
+ pdt.assert_series_equal(obs, exp)
+
+ def test_pd_series_to_Predictions_format_allow_nans(self):
+ transformer = self.get_transformer(pd.Series, PredictionsFormat)
+ exp = pd.Series([1, np.nan, 3, np.nan],
+ name='prediction', index=['a', 'b', 'c', 'd'])
+ obs = transformer(exp)
+ obs = pd.read_csv(str(obs), sep='\t', header=0, index_col=0,
+ squeeze=True)
+ pdt.assert_series_equal(obs, exp)
+
+ def test_Predictions_format_to_pd_series(self):
+ _, obs = self.transform_format(
+ PredictionsFormat, pd.Series, 'predictions.tsv')
+ exp_index = pd.Index(['10249.C001.10SS', '10249.C002.05SS',
+ '10249.C004.01SS', '10249.C004.11SS'],
+ name='id', dtype=object)
+ exp = pd.Series([4.5, 2.5, 0.5, 4.5], name='prediction',
+ index=exp_index)
+ pdt.assert_series_equal(obs[:4], exp)
+
+ def test_Predictions_format_to_metadata(self):
+ _, obs = self.transform_format(
+ PredictionsFormat, qiime2.Metadata, 'predictions.tsv')
+ exp_index = pd.Index(['10249.C001.10SS', '10249.C002.05SS',
+ '10249.C004.01SS', '10249.C004.11SS'],
+ name='id')
+ exp = pd.DataFrame([4.5, 2.5, 0.5, 4.5], columns=['prediction'],
+ index=exp_index)
+ pdt.assert_frame_equal(obs.to_dataframe()[:4], exp)
+
+ # test Importance format
+ def test_Importance_format_validate_positive(self):
+ filepath = self.get_data_path('importance.tsv')
+ format = ImportanceFormat(filepath, mode='r')
+ format.validate(level='min')
+ format.validate()
+
+ def test_Importance_format_validate_negative_nonnumeric(self):
+ filepath = self.get_data_path('chardonnay.map.txt')
+ format = ImportanceFormat(filepath, mode='r')
+ with self.assertRaisesRegex(ValidationError, 'numeric values'):
+ format.validate()
+
+ def test_Importance_format_validate_negative_empty(self):
+ filepath = self.get_data_path('empty_file.txt')
+ format = ImportanceFormat(filepath, mode='r')
+ with self.assertRaisesRegex(ValidationError, 'one data record'):
+ format.validate()
+
+ def test_Importance_format_validate_negative(self):
+ filepath = self.get_data_path('garbage.txt')
+ format = ImportanceFormat(filepath, mode='r')
+ with self.assertRaisesRegex(ValidationError, 'two or more fields'):
+ format.validate()
+
+ def test_Importance_dir_fmt_validate_positive(self):
+ filepath = self.get_data_path('importance.tsv')
+ shutil.copy(filepath, self.temp_dir.name)
+ format = ImportanceDirectoryFormat(self.temp_dir.name, mode='r')
+ format.validate()
+
+ def test_Importance_semantic_type_registration(self):
+ self.assertRegisteredSemanticType(Importance)
+
+ def test_sample_data_Importance_to_Importance_dir_fmt_registration(self):
+ self.assertSemanticTypeRegisteredToFormat(
+ FeatureData[Importance], ImportanceDirectoryFormat)
+
+ def test_pd_dataframe_to_Importance_format(self):
+ transformer = self.get_transformer(pd.DataFrame, ImportanceFormat)
+ exp = pd.DataFrame([1, 2, 3, 4],
+ columns=['importance'], index=['a', 'b', 'c', 'd'])
+ obs = transformer(exp)
+ obs = pd.read_csv(str(obs), sep='\t', header=0, index_col=0)
+ pdt.assert_frame_equal(exp, obs)
+
+ def test_Importance_format_to_pd_dataframe(self):
+ _, obs = self.transform_format(
+ ImportanceFormat, pd.DataFrame, 'importance.tsv')
+ exp_index = pd.Index(['74ec9fe6ffab4ecff6d5def74298a825',
+ 'c82032c40c98975f71892e4be561c87a',
+ '79280cea51a6fe8a3432b2f266dd34db',
+ 'f7686a74ca2d3729eb66305e8a26309b'],
+ name='id')
+ exp = pd.DataFrame([0.44469828320835586, 0.07760118417569697,
+ 0.06570251750505914, 0.061718558716901406],
+ columns=['importance'],
+ index=exp_index)
+ pdt.assert_frame_equal(exp, obs[:4])
+
+ def test_Importance_format_to_metadata(self):
+ _, obs = self.transform_format(
+ ImportanceFormat, qiime2.Metadata, 'importance.tsv')
+ exp_index = pd.Index(['74ec9fe6ffab4ecff6d5def74298a825',
+ 'c82032c40c98975f71892e4be561c87a',
+ '79280cea51a6fe8a3432b2f266dd34db',
+ 'f7686a74ca2d3729eb66305e8a26309b'],
+ name='id')
+ exp = pd.DataFrame([0.44469828320835586, 0.07760118417569697,
+ 0.06570251750505914, 0.061718558716901406],
+ columns=['importance'],
+ index=exp_index)
+ pdt.assert_frame_equal(obs.to_dataframe()[:4], exp)
+
+ # test Probabilities format
+ def test_Probabilities_format_validate_positive(self):
+ filepath = self.get_data_path('class_probabilities.tsv')
+ format = ProbabilitiesFormat(filepath, mode='r')
+ format.validate(level='min')
+ format.validate()
+
+ def test_Probabilities_format_validate_negative_nonnumeric(self):
+ filepath = self.get_data_path('chardonnay.map.txt')
+ format = ProbabilitiesFormat(filepath, mode='r')
+ with self.assertRaisesRegex(ValidationError, 'numeric values'):
+ format.validate()
+
+ def test_Probabilities_format_validate_negative_empty(self):
+ filepath = self.get_data_path('empty_file.txt')
+ format = ProbabilitiesFormat(filepath, mode='r')
+ with self.assertRaisesRegex(ValidationError, 'one data record'):
+ format.validate()
+
+ def test_Probabilities_format_validate_negative(self):
+ filepath = self.get_data_path('garbage.txt')
+ format = ProbabilitiesFormat(filepath, mode='r')
+ with self.assertRaisesRegex(ValidationError, 'two or more fields'):
+ format.validate()
+
+ def test_Probabilities_dir_fmt_validate_positive(self):
+ filepath = self.get_data_path('class_probabilities.tsv')
+ shutil.copy(filepath, self.temp_dir.name)
+ format = ProbabilitiesDirectoryFormat(self.temp_dir.name, mode='r')
+ format.validate()
+
+ def test_Probabilities_semantic_type_registration(self):
+ self.assertRegisteredSemanticType(Probabilities)
+
+ def test_sample_data_Probabilities_to_Probs_dir_fmt_registration(self):
+ self.assertSemanticTypeRegisteredToFormat(
+ SampleData[Probabilities], ProbabilitiesDirectoryFormat)
+
+ def test_pd_dataframe_to_Probabilities_format(self):
+ transformer = self.get_transformer(pd.DataFrame, ProbabilitiesFormat)
+ exp = pd.DataFrame([[0.1, 0.77], [0.8, 0.4], [0.7, 0.1], [0.44, 0.73]],
+ columns=['classA', 'classB'],
+ index=['a', 'b', 'c', 'd'])
+ obs = transformer(exp)
+ obs = pd.read_csv(str(obs), sep='\t', header=0, index_col=0,
+ parse_dates=True)
+ pdt.assert_frame_equal(exp, obs)
+
+ def test_Probabilities_format_to_pd_dataframe(self):
+ _, obs = self.transform_format(
+ ProbabilitiesFormat, pd.DataFrame, 'class_probabilities.tsv')
+ exp_index = pd.Index(['s1', 's2', 's3', 's4', 's5', 's6', 's7'],
+ name='id')
+ exp = pd.DataFrame([[0.4446, 0.9828, 0.3208],
+ [0.0776, 0.0118, 0.4175],
+ [0.0657, 0.0251, 0.7505],
+ [0.0617, 0.1855, 0.8716],
+ [0.0281, 0.8616, 0.0291],
+ [0.0261, 0.0253, 0.9075],
+ [0.0252, 0.7385, 0.4068]],
+ columns=['classA', 'classB', 'classC'],
+ index=exp_index)
+ pdt.assert_frame_equal(exp, obs)
+
+ def test_Probabilities_format_to_metadata(self):
+ _, obs = self.transform_format(
+ ProbabilitiesFormat, qiime2.Metadata, 'class_probabilities.tsv')
+ exp_index = pd.Index(['s1', 's2', 's3', 's4', 's5', 's6', 's7'],
+ name='id')
+ exp = pd.DataFrame([[0.4446, 0.9828, 0.3208],
+ [0.0776, 0.0118, 0.4175],
+ [0.0657, 0.0251, 0.7505],
+ [0.0617, 0.1855, 0.8716],
+ [0.0281, 0.8616, 0.0291],
+ [0.0261, 0.0253, 0.9075],
+ [0.0252, 0.7385, 0.4068]],
+ columns=['classA', 'classB', 'classC'],
+ index=exp_index)
+ pdt.assert_frame_equal(obs.to_dataframe(), exp)
+
+ # test utility formats
+ def test_pickle_format_validate_negative(self):
+ filepath = self.get_data_path('coordinates.tsv')
+ format = PickleFormat(filepath, mode='r')
+ with self.assertRaisesRegex(ValidationError, 'pickled file'):
+ format.validate()
+
+ def test_json_format_validate_negative(self):
+ filepath = self.get_data_path('coordinates.tsv')
+ format = JSONFormat(filepath, mode='r')
+ with self.assertRaisesRegex(ValidationError, 'Expecting value'):
+ format.validate()
+
+ # this just checks that palette names are valid input
+ def test_custom_palettes(self):
+ confused = np.array([[1, 0], [0, 1]])
+ for palette in _custom_palettes().keys():
+ _plot_heatmap_from_confusion_matrix(confused, palette)
+
+
+class TestTypes(SampleClassifierTestPluginBase):
+ def test_sample_estimator_semantic_type_registration(self):
+ self.assertRegisteredSemanticType(SampleEstimator)
+
+ def test_classifier_semantic_type_registration(self):
+ self.assertRegisteredSemanticType(Classifier)
+
+ def test_regressor_semantic_type_registration(self):
+ self.assertRegisteredSemanticType(Regressor)
+
+ def test_sample_classifier_semantic_type_to_format_registration(self):
+ self.assertSemanticTypeRegisteredToFormat(
+ SampleEstimator[Classifier], SampleEstimatorDirFmt)
+
+ def test_sample_regressor_semantic_type_to_format_registration(self):
+ self.assertSemanticTypeRegisteredToFormat(
+ SampleEstimator[Regressor], SampleEstimatorDirFmt)
+
+
+class TestFormats(SampleEstimatorTestBase):
+ def test_sample_classifier_dir_fmt(self):
+ format = self._custom_setup(sklearn.__version__)
+
+ # Should not error
+ format.validate()
+
+
+class TestTransformers(SampleEstimatorTestBase):
+ def test_old_sklearn_version(self):
+ transformer = self.get_transformer(
+ SampleEstimatorDirFmt, Pipeline)
+ input = self._custom_setup('a very old version')
+ with self.assertRaises(ValueError):
+ transformer(input)
+
+ def test_taxo_class_dir_fmt_to_taxo_class_result(self):
+ input = self._custom_setup(sklearn.__version__)
+
+ transformer = self.get_transformer(
+ SampleEstimatorDirFmt, Pipeline)
+ obs = transformer(input)
+
+ self.assertTrue(obs)
+
+ def test_taxo_class_result_to_taxo_class_dir_fmt(self):
+ def read_pipeline(pipeline_filepath):
+ with tarfile.open(pipeline_filepath) as tar:
+ dirname = tempfile.mkdtemp()
+ tar.extractall(dirname)
+ pipeline = joblib.load(os.path.join(dirname,
+ 'sklearn_pipeline.pkl'))
+ for fn in tar.getnames():
+ os.unlink(os.path.join(dirname, fn))
+ os.rmdir(dirname)
+ return pipeline
+
+ exp = read_pipeline(self.sklearn_pipeline)
+ transformer = self.get_transformer(
+ Pipeline, SampleEstimatorDirFmt)
+ obs = transformer(exp)
+ sklearn_pipeline = obs.sklearn_pipeline.view(PickleFormat)
+ obs_pipeline = read_pipeline(str(sklearn_pipeline))
+ obs = obs_pipeline
+ self.assertTrue(obs)
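
The read_pipeline helper above shows how a SampleEstimator artifact stores its model: a tar archive holding a joblib-serialized scikit-learn pipeline named sklearn_pipeline.pkl. A minimal round-trip sketch of that layout (the member name comes from the test; how the real transformer assembles the archive is an assumption here):

    import os
    import tarfile
    import tempfile

    import joblib
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_extraction import DictVectorizer
    from sklearn.pipeline import Pipeline

    pipeline = Pipeline([('dv', DictVectorizer()),
                         ('est', RandomForestClassifier(n_estimators=10))])
    pipeline.fit([{'a': 1.0}, {'a': 0.0}], ['x', 'y'])

    # Write side (assumed): joblib-dump the pipeline, pack it into a tar.
    workdir = tempfile.mkdtemp()
    pkl = os.path.join(workdir, 'sklearn_pipeline.pkl')
    joblib.dump(pipeline, pkl)
    archive = os.path.join(workdir, 'pipeline.tar')
    with tarfile.open(archive, 'w') as tar:
        tar.add(pkl, arcname='sklearn_pipeline.pkl')

    # Read side mirrors read_pipeline() above: extract, then joblib.load.
    with tarfile.open(archive) as tar:
        tar.extractall(workdir)
    reloaded = joblib.load(os.path.join(workdir, 'sklearn_pipeline.pkl'))
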
=====================================
q2_sample_classifier/tests/test_utilities.py
=====================================
@@ -0,0 +1,154 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2017-2020, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+import pandas as pd
+import biom
+import numpy as np
+from sklearn.svm import LinearSVC
+from sklearn.feature_extraction import DictVectorizer
+from sklearn.pipeline import Pipeline
+from sklearn.ensemble import RandomForestClassifier
+import pandas.util.testing as pdt
+
+import qiime2
+
+from q2_sample_classifier.utilities import (
+ _load_data, _calculate_feature_importances, _extract_important_features,
+ _disable_feature_selection, _mean_feature_importance,
+ _null_feature_importance, _extract_features)
+from q2_sample_classifier.tests.test_base_class import \
+ SampleClassifierTestPluginBase
+
+
+class UtilitiesTests(SampleClassifierTestPluginBase):
+
+ def setUp(self):
+ super().setUp()
+
+ exp_rf = pd.DataFrame(
+ {'importance': [0.1, 0.2, 0.3]}, index=['a', 'b', 'c'])
+ exp_rf.index.name = 'feature'
+ self.exp_rf = exp_rf
+
+ exp_svm = pd.DataFrame(
+ {'importance0': [0.1, 0.2, 0.3], 'importance1': [0.4, 0.5, 0.6]},
+ index=['a', 'b', 'c'])
+ exp_svm.index.name = 'feature'
+ self.exp_svm = exp_svm
+
+ exp_lsvm = pd.DataFrame(
+ {'importance0': [-0.048794, -0.048794, -0.048794]},
+ index=['a', 'b', 'c'])
+ exp_lsvm.index.name = 'feature'
+ self.exp_lsvm = exp_lsvm
+
+ self.features = biom.Table(np.array([[1]*5]*3), ['a', 'b', 'c'],
+ list(map(str, range(5))))
+
+ self.targets = pd.Series(['a', 'a', 'b', 'b', 'a'], name='bullseye')
+
+ def test_extract_important_features_1d_array(self):
+ importances = _extract_important_features(
+ self.features.ids('observation'),
+ np.ndarray((3,), buffer=np.array([0.1, 0.2, 0.3])))
+ self.assertEqual(sorted(self.exp_rf), sorted(importances))
+
+ def test_extract_important_features_2d_array(self):
+ importances = _extract_important_features(
+ self.features.ids('observation'),
+ np.ndarray(
+ (2, 3), buffer=np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6])))
+ self.assertEqual(sorted(self.exp_svm), sorted(importances))
+
+ # test feature importance calculation with main classifier types
+ def test_calculate_feature_importances_ensemble(self):
+ estimator = Pipeline(
+ [('dv', DictVectorizer()),
+ ('est', RandomForestClassifier(n_estimators=10))])
+ estimator.fit(_extract_features(self.features),
+ self.targets.values.ravel())
+ fi = _calculate_feature_importances(estimator)
+ self.assertEqual(sorted(self.exp_rf), sorted(fi))
+
+ def test_calculate_feature_importances_svm(self):
+ estimator = Pipeline(
+ [('dv', DictVectorizer()), ('est', LinearSVC())])
+ estimator.fit(_extract_features(self.features),
+ self.targets.values.ravel())
+ fi = _calculate_feature_importances(estimator)
+ self.assertEqual(sorted(self.exp_lsvm), sorted(fi))
+
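The two tests above cover the two places scikit-learn keeps importance scores: tree ensembles expose feature_importances_ (one value per feature), while linear models such as LinearSVC expose per-class coefficient rows in coef_. A minimal sketch of reading the ensemble case out of a fitted pipeline and labeling it with the DictVectorizer's feature names; the exact logic of _calculate_feature_importances is not shown in this hunk:

    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_extraction import DictVectorizer
    from sklearn.pipeline import Pipeline

    X = [{'a': 1.0, 'b': 2.0}, {'a': 0.0, 'b': 1.0}, {'a': 3.0, 'b': 0.0}]
    y = ['x', 'y', 'x']
    est = Pipeline([('dv', DictVectorizer()),
                    ('est', RandomForestClassifier(n_estimators=10))])
    est.fit(X, y)

    # feature_importances_ lines up with the vectorizer's feature order.
    names = est.named_steps['dv'].get_feature_names()  # _out() in newer sklearn
    scores = est.named_steps['est'].feature_importances_
    importances = pd.DataFrame({'importance': scores}, index=names)
    importances.index.name = 'feature'
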
+ # confirm that feature selection incompatibility warnings work
+ def test_disable_feature_selection_unsupported(self):
+ with self.assertWarnsRegex(UserWarning, "does not support recursive"):
+ _disable_feature_selection('KNeighborsClassifier', False)
+
+ def test_mean_feature_importance_1d_arrays(self):
+ exp = pd.DataFrame([10, 9, 8, 7], columns=["importance0"],
+ index=[3, 2, 1, 0])
+ imps = [pd.DataFrame([1, 2, 3, 4], columns=["importance0"]),
+ pd.DataFrame([5, 6, 7, 8], columns=["importance0"]),
+ pd.DataFrame([9, 10, 11, 12], columns=["importance0"]),
+ pd.DataFrame([13, 14, 15, 16], columns=["importance0"])]
+ pdt.assert_frame_equal(_mean_feature_importance(imps), exp)
+
+ def test_mean_feature_importance_different_column_names(self):
+ exp = pd.DataFrame([[6, 5, 4, 3], [14, 13, 12, 11]],
+ index=["importance0", "importance1"],
+ columns=[3, 2, 1, 0]).T
+ imps = [pd.DataFrame([1, 2, 3, 4], columns=["importance0"]),
+ pd.DataFrame([5, 6, 7, 8], columns=["importance0"]),
+ pd.DataFrame([9, 10, 11, 12], columns=["importance1"]),
+ pd.DataFrame([13, 14, 15, 16], columns=["importance1"])]
+ pdt.assert_frame_equal(_mean_feature_importance(imps), exp)
+
+ def test_mean_feature_importance_2d_arrays(self):
+ exp = pd.DataFrame([[3.5] * 4, [9.5] * 4],
+ index=["importance0", "importance1"],
+ columns=[0, 1, 2, 3]).T
+ imps = [pd.DataFrame([[6, 5, 4, 3], [14, 13, 12, 11]],
+ index=["importance0", "importance1"],
+ columns=[0, 1, 2, 3]).T,
+ pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]],
+ index=["importance0", "importance1"],
+ columns=[0, 1, 2, 3]).T]
+ pdt.assert_frame_equal(_mean_feature_importance(imps), exp)
+
+ # This case should not occur in practice, but if all column names are
+ # unique the frames should just be concatenated and sorted, not collapsed
+ def test_mean_feature_importance_do_not_collapse(self):
+ imps = [pd.DataFrame([4, 3, 2, 1], columns=["importance0"]),
+ pd.DataFrame([16, 15, 14, 13], columns=["importance1"])]
+ exp = pd.concat(imps, axis=1)
+ pdt.assert_frame_equal(_mean_feature_importance(imps), exp)
+
+ def test_null_feature_importance(self):
+ exp = pd.DataFrame(
+ [1, 1, 1], index=['o1', 'o2', 'o3'], columns=['importance'])
+ exp.index.name = 'feature'
+ tab = biom.Table(np.array([[1., 2., 3.], [3., 2., 1.], [7., 6., 9.]]),
+ ['o1', 'o2', 'o3'], ['s1', 's2', 's3'])
+ tab = _extract_features(tab)
+ pdt.assert_frame_equal(_null_feature_importance(tab), exp)
+
+ def test_load_data(self):
+ # phony feature table
+ id_map = {'0': 'peanut', '1': 'bugs', '2': 'qiime2', '3': 'matt',
+ '4': 'pandas'}
+ a = self.features.update_ids(id_map, axis='sample')
+ # phony metadata, convert to qiime2.Metadata
+ b = self.targets
+ b.index = ['pandas', 'peanut', 'qiime1', 'flapjacks', 'bugs']
+ b.index.name = '#SampleID'
+ b = qiime2.Metadata(b.to_frame())
+ # test that merge of tables is inner merge
+ intersection = set(('peanut', 'bugs', 'pandas'))
+ feature_data, targets = _load_data(a, b, missing_samples='ignore')
+ exp = [{'c': 1.0, 'a': 1.0, 'b': 1.0}, {'c': 1.0, 'a': 1.0, 'b': 1.0},
+ {'c': 1.0, 'a': 1.0, 'b': 1.0}]
+ np.testing.assert_array_equal(feature_data, exp)
+ self.assertEqual(set(targets.index), intersection)
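
test_load_data pins down the matching rule: the feature table and the metadata are joined on their shared sample IDs, so only 'peanut', 'bugs', and 'pandas' survive. A minimal sketch of the same inner-join idea in plain pandas, assuming the helper's filtering reduces to an index intersection:

    import pandas as pd

    table = pd.DataFrame(
        {'a': [1.0] * 5, 'b': [1.0] * 5, 'c': [1.0] * 5},
        index=['peanut', 'bugs', 'qiime2', 'matt', 'pandas'])
    targets = pd.Series(
        ['a', 'a', 'b', 'b', 'a'], name='bullseye',
        index=['pandas', 'peanut', 'qiime1', 'flapjacks', 'bugs'])

    # Inner join: keep only samples present in both table and metadata.
    shared = table.index.intersection(targets.index)
    table, targets = table.loc[shared], targets.loc[shared]
    assert set(targets.index) == {'peanut', 'bugs', 'pandas'}
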
=====================================
q2_sample_classifier/tests/test_visualization.py
=====================================
@@ -0,0 +1,237 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2017-2020, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+import pandas as pd
+import pandas.util.testing as pdt
+from os import mkdir, listdir
+from os.path import join
+import biom
+
+import qiime2
+from qiime2.plugins import sample_classifier
+
+from q2_sample_classifier.visuals import (
+ _linear_regress, _calculate_baseline_accuracy,
+ _add_sample_size_to_xtick_labels)
+from q2_sample_classifier.classify import (
+ scatterplot, confusion_matrix)
+from q2_sample_classifier.utilities import (
+ _match_series_or_die, _predict_and_plot)
+from q2_sample_classifier.tests.test_base_class import \
+ SampleClassifierTestPluginBase
+
+
+class TestVisuals(SampleClassifierTestPluginBase):
+
+ md = pd.DataFrame([(1, 'a', 0.11), (1, 'a', 0.12), (1, 'a', 0.13),
+ (2, 'a', 0.19), (2, 'a', 0.18), (2, 'a', 0.21),
+ (1, 'b', 0.14), (1, 'b', 0.13), (1, 'b', 0.14),
+ (2, 'b', 0.26), (2, 'b', 0.27), (2, 'b', 0.29)],
+ columns=['Time', 'Group', 'Value'])
+
+ def test_linear_regress(self):
+ res = _linear_regress(self.md['Value'], self.md['Time'])
+ self.assertAlmostEqual(res.iloc[0]['Mean squared error'], 1.9413916666)
+ self.assertAlmostEqual(res.iloc[0]['r-value'], 0.86414956372460128)
+ self.assertAlmostEqual(res.iloc[0]['r-squared'], 0.74675446848541871)
+ self.assertAlmostEqual(res.iloc[0]['P-value'], 0.00028880275858705694)
+
+ def test_calculate_baseline_accuracy(self):
+ accuracy = 0.9
+ y_test = pd.Series(['a', 'a', 'a', 'b', 'b', 'b'], name="class")
+ classifier_accuracy = _calculate_baseline_accuracy(y_test, accuracy)
+ expected_results = (6, 3, 0.5, 1.8)
+ for i in zip(classifier_accuracy, expected_results):
+ self.assertEqual(i[0], i[1])
+
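The expected tuple (6, 3, 0.5, 1.8) reads as: six test samples, three in the largest class, a 0.5 baseline accuracy for always guessing that class, and a 1.8 ratio of the observed 0.9 accuracy to that baseline. A worked sketch of the arithmetic, assuming this interpretation of the helper's return values:

    import pandas as pd

    y_test = pd.Series(['a', 'a', 'a', 'b', 'b', 'b'], name='class')
    accuracy = 0.9

    n = len(y_test)                          # 6 test samples
    largest = y_test.value_counts().iloc[0]  # 3: size of the biggest class
    baseline = largest / n                   # 0.5: always guess the majority
    ratio = accuracy / baseline              # 1.8: model vs. majority guess
    assert (n, largest, baseline, ratio) == (6, 3, 0.5, 1.8)
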
+
+class TestHeatmap(SampleClassifierTestPluginBase):
+
+ def setUp(self):
+ super().setUp()
+ md_vaw = self.get_data_path('vaw.txt')
+ md_vaw = qiime2.Metadata.load(md_vaw)
+ self.md_vaw = md_vaw.get_column('Column')
+ table_vaw = self.get_data_path('vaw.qza')
+ self.table_vaw = qiime2.Artifact.load(table_vaw)
+ imp = pd.read_csv(
+ self.get_data_path('vaw_importance.tsv'), sep='\t',
+ header=0, index_col=0)
+ self.imp = qiime2.Artifact.import_data('FeatureData[Importance]', imp)
+
+ def test_heatmap_default_feature_count_zero(self):
+ heatmap, table = sample_classifier.actions.heatmap(
+ self.table_vaw, self.imp, self.md_vaw, group_samples=True,
+ feature_count=0)
+ self.assertEqual(table.view(biom.Table).shape, (5, 2))
+
+ def test_heatmap_importance_threshold(self):
+ heatmap, table = sample_classifier.actions.heatmap(
+ self.table_vaw, self.imp, self.md_vaw,
+ importance_threshold=0.062, group_samples=False, feature_count=0)
+ self.assertEqual(table.view(biom.Table).shape, (3, 6))
+
+ def test_heatmap_feature_count(self):
+ heatmap, table = sample_classifier.actions.heatmap(
+ self.table_vaw, self.imp, self.md_vaw, group_samples=True,
+ feature_count=2)
+ self.assertEqual(table.view(biom.Table).shape, (2, 2))
+
+ def test_heatmap_must_group_or_die(self):
+ with self.assertRaisesRegex(ValueError, "metadata are not optional"):
+ heatmap, table = sample_classifier.actions.heatmap(
+ self.table_vaw, self.imp, sample_metadata=None,
+ group_samples=True)
+
+
+# This class mainly checks that these visualizers run without error. It also
+# tests some internal nuts and bolts, but there is not much else we can do.
+class TestPlottingVisualizers(SampleClassifierTestPluginBase):
+ def setUp(self):
+ super().setUp()
+ self.tmpd = join(self.temp_dir.name, 'viz')
+ mkdir(self.tmpd)
+
+ self.a = pd.Series(['a', 'a', 'b', 'b', 'c', 'c'], name='site',
+ index=['a1', 'a2', 'b1', 'b2', 'c1', 'c2'])
+ self.a.index.name = 'SampleID'
+ self.bogus = pd.Series(['a', 'a', 'b', 'b', 'c', 'c'], name='site',
+ index=['a1', 'e3', 'f5', 'b2', 'z1', 'c2'])
+ self.bogus.index.name = 'SampleID'
+ self.c = pd.Series(
+ [0, 1, 2, 3], index=['a', 'b', 'c', 'd'], name='peanuts')
+ self.c.index.name = 'SampleID'
+
+ def test_confusion_matrix(self):
+ b = qiime2.CategoricalMetadataColumn(self.a)
+ confusion_matrix(self.tmpd, self.a, b)
+
+ def test_confusion_matrix_class_overlap_error(self):
+ b = pd.Series([1, 2, 3, 4, 5, 6], name='site',
+ index=['a1', 'a2', 'b1', 'b2', 'c1', 'c2'])
+ b.index.name = 'id'
+ b = qiime2.NumericMetadataColumn(b)
+ with self.assertRaisesRegex(ValueError, "do not overlap"):
+ confusion_matrix(self.tmpd, self.a, b)
+
+ def test_confusion_matrix_vmin_too_high(self):
+ b = qiime2.CategoricalMetadataColumn(self.a)
+ with self.assertRaisesRegex(ValueError, r'vmin must be less than.*\s\s'
+ r'0\.5.*greater.*0\.0'):
+ confusion_matrix(self.tmpd, self.a, b, vmin=.5, vmax=None)
+
+ def test_confusion_matrix_vmax_too_low(self):
+ b = qiime2.CategoricalMetadataColumn(self.a)
+ with self.assertRaisesRegex(ValueError, r'vmax must be greater than.*'
+ r'\s\s0\.5.*less.*1\.0'):
+ confusion_matrix(self.tmpd, self.a, b, vmin=None, vmax=.5)
+
+ def test_confusion_matrix_vmin_too_high_and_vmax_too_low(self):
+ b = qiime2.CategoricalMetadataColumn(self.a)
+ with self.assertRaisesRegex(ValueError, r'vmin must be less than.*\s'
+ r'\s0\.5.*greater.*0\.0\s.*vmax must be '
+ r'greater than.*\s\s0\.5.*less.*1\.0'):
+ confusion_matrix(self.tmpd, self.a, b, vmin=.5, vmax=.5)
+
+ def test_confusion_matrix_dtype_coercion(self):
+ predictions = pd.Series([1, 1, 1, 2, 2, 2],
+ index=pd.Index(['a', 'b', 'c', 'd', 'e', 'f'],
+ name='sample_id'), name='features')
+
+ # NOTE: the targets are numbers but represented as str
+ truth = qiime2.CategoricalMetadataColumn(pd.Series(
+ ['1', '2', '1', '2', '1', '2'],
+ index=pd.Index(['a', 'b', 'c', 'd', 'e', 'f'], name='sample-id'),
+ name='target'))
+
+ confusion_matrix(self.tmpd, predictions, truth)
+
+ self.assertTrue('index.html' in listdir(self.tmpd))
+
+ # test confusion matrix plotting independently to see how it handles
+ # partially overlapping classes when true labels are superset
+ def test_predict_and_plot_true_labels_are_superset(self):
+ b = pd.Series(['a', 'a', 'b', 'b', 'b', 'b'], name='site',
+ index=['a1', 'a2', 'b1', 'b2', 'c1', 'c2'])
+ exp = pd.DataFrame(
+ [[1., 0., 0., ''],
+ [0., 1., 0., ''],
+ [0., 1., 0., ''],
+ ['', '', '', 0.666666666],
+ ['', '', '', 0.3333333333],
+ ['', '', '', 2.]],
+ columns=['a', 'b', 'c', 'Overall Accuracy'],
+ index=['a', 'b', 'c', 'Overall Accuracy', 'Baseline Accuracy',
+ 'Accuracy Ratio'])
+ predictions, confusion = _predict_and_plot(self.tmpd, self.a, b)
+ pdt.assert_frame_equal(exp, predictions)
+
+ # test confusion matrix plotting independently to see how it handles
+ # partially overlapping classes when true labels are a subset
+ def test_predict_and_plot_true_labels_are_subset(self):
+ b = pd.Series(['a', 'a', 'b', 'b', 'c', 'd'], name='site',
+ index=['a1', 'a2', 'b1', 'b2', 'c1', 'c2'])
+ exp = pd.DataFrame(
+ [[1., 0., 0., 0., ''],
+ [0., 1., 0., 0., ''],
+ [0., 0., 0.5, 0.5, ''],
+ [0., 0., 0., 0., ''],
+ ['', '', '', '', 0.8333333333],
+ ['', '', '', '', 0.3333333333],
+ ['', '', '', '', 2.5]],
+ columns=['a', 'b', 'c', 'd', 'Overall Accuracy'],
+ index=['a', 'b', 'c', 'd', 'Overall Accuracy', 'Baseline Accuracy',
+ 'Accuracy Ratio'])
+ predictions, confusion = _predict_and_plot(self.tmpd, self.a, b)
+ pdt.assert_frame_equal(exp, predictions)
+
+ # test confusion matrix plotting independently to see how it handles
+ # partially overlapping classes when true labels are mutually exclusive
+ def test_predict_and_plot_true_labels_are_mutually_exclusive(self):
+ b = pd.Series(['a', 'a', 'e', 'e', 'd', 'd'], name='site',
+ index=['a1', 'a2', 'b1', 'b2', 'c1', 'c2'])
+ exp = pd.DataFrame(
+ [[1., 0., 0., 0., 0., ''],
+ [0., 0., 0., 0., 1., ''],
+ [0., 0., 0., 1., 0., ''],
+ [0., 0., 0., 0., 0., ''],
+ [0., 0., 0., 0., 0., ''],
+ ['', '', '', '', '', 0.3333333333],
+ ['', '', '', '', '', 0.3333333333],
+ ['', '', '', '', '', 1.]],
+ columns=['a', 'b', 'c', 'd', 'e', 'Overall Accuracy'],
+ index=['a', 'b', 'c', 'd', 'e', 'Overall Accuracy',
+ 'Baseline Accuracy', 'Accuracy Ratio'])
+ predictions, confusion = _predict_and_plot(self.tmpd, self.a, b)
+ pdt.assert_frame_equal(exp, predictions)
+
+ def test_scatterplot(self):
+ b = qiime2.NumericMetadataColumn(self.c)
+ scatterplot(self.tmpd, self.c, b)
+
+ def test_add_sample_size_to_xtick_labels(self):
+ labels = _add_sample_size_to_xtick_labels(self.a, ['a', 'b', 'c'])
+ exp = ['a (n=2)', 'b (n=2)', 'c (n=2)']
+ self.assertListEqual(labels, exp)
+
+ # now test performance when extra classes are present
+ def test_add_sample_size_to_xtick_labels_extra_classes(self):
+ labels = _add_sample_size_to_xtick_labels(
+ self.a, [0, 'a', 'b', 'bb', 'c'])
+ exp = ['0 (n=0)', 'a (n=2)', 'b (n=2)', 'bb (n=0)', 'c (n=2)']
+ self.assertListEqual(labels, exp)
+
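These two tests fix the labeling contract: every requested class gets an '(n=...)' suffix with its frequency in the series, and classes that never occur get n=0. A plausible equivalent using value_counts, assuming the real helper counts the same way (its body is not part of this diff):

    import pandas as pd

    ser = pd.Series(['a', 'a', 'b', 'b', 'c', 'c'], name='site')
    counts = ser.value_counts()
    # Classes absent from the series fall back to a count of zero.
    labels = ['{0} (n={1})'.format(c, counts[c] if c in counts else 0)
              for c in [0, 'a', 'b', 'bb', 'c']]
    assert labels == ['0 (n=0)', 'a (n=2)', 'b (n=2)', 'bb (n=0)', 'c (n=2)']
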
+ def test_match_series_or_die(self):
+ exp = pd.Series(['a', 'b', 'c'], name='site', index=['a1', 'b2', 'c2'])
+ exp.index.name = 'SampleID'
+ a, b = _match_series_or_die(self.a, self.bogus, 'ignore')
+ pdt.assert_series_equal(exp, a)
+ pdt.assert_series_equal(exp, b)
+
+ def test_match_series_or_die_missing_samples(self):
+ with self.assertRaisesRegex(ValueError, "Missing samples"):
+ a, b = _match_series_or_die(self.a, self.bogus, 'error')
=====================================
q2_sample_classifier/utilities.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2017-2019, QIIME 2 development team.
+# Copyright (c) 2017-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
@@ -535,7 +535,7 @@ def _summarize_estimator(output_dir, sample_estimator):
def _visualize(output_dir, estimator, cm, roc,
optimize_feature_selection=True, title='results'):
- pd.set_option('display.max_colwidth', -1)
+ pd.set_option('display.max_colwidth', None)
# summarize model accuracy and params
if estimator is not None:
@@ -694,7 +694,7 @@ def _mean_feature_importance(importances):
containing importance scores of the same features from multiple models
(e.g., CV importance scores).
'''
- imp = pd.concat(importances, axis=1)
+ imp = pd.concat(importances, axis=1, sort=True)
# groupby column name instead of taking column mean to support 2d arrays
imp = imp.groupby(imp.columns, axis=1).mean()
return imp.sort_values(imp.columns[0], ascending=False)
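
Two small compatibility fixes in this file: pandas >=1 no longer accepts -1 for display.max_colwidth (None is now the unlimited value), and passing sort=True to pd.concat makes sorting of the non-concatenation axis explicit now that its default changed across pandas versions. The groupby on column names then averages same-named columns across CV folds while leaving uniquely named ones alone, which is exactly what the do-not-collapse test above relies on. A compact illustration of the collapse step:

    import pandas as pd

    folds = [pd.DataFrame([1, 2, 3, 4], columns=['importance0']),
             pd.DataFrame([5, 6, 7, 8], columns=['importance0'])]
    imp = pd.concat(folds, axis=1, sort=True)      # two 'importance0' columns
    imp = imp.groupby(imp.columns, axis=1).mean()  # collapse same-named columns
    imp = imp.sort_values(imp.columns[0], ascending=False)
    # Row means are [3, 4, 5, 6]; descending sort yields index order 3, 2, 1, 0.
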
=====================================
q2_sample_classifier/visuals.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2017-2019, QIIME 2 development team.
+# Copyright (c) 2017-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
@@ -101,8 +101,28 @@ def _linear_regress(actual, pred):
def _plot_heatmap_from_confusion_matrix(cm, palette, vmin=None, vmax=None):
palette = _custom_palettes()[palette]
- return sns.heatmap(cm, vmin=vmin, vmax=vmax, cmap=palette,
- cbar_kws={'label': 'Proportion'})
+ plt.figure()
+ scaler, labelsize, dpi, cbar_min = 20, 8, 100, .15
+ sns.set(rc={'xtick.labelsize': labelsize, 'ytick.labelsize': labelsize,
+ 'figure.dpi': dpi})
+ fig, (ax, cax) = plt.subplots(ncols=2, constrained_layout=True)
+ heatmap = sns.heatmap(cm, vmin=vmin, vmax=vmax, cmap=palette, ax=ax,
+ cbar_ax=cax, cbar_kws={'label': 'Proportion'},
+ square=True, xticklabels=True, yticklabels=True)
+
+ # Resize the plot dynamically based on number of classes
+ hm_pos = ax.get_position()
+ scale = len(cm) / scaler
+ # prevent cbar from getting unreadably small
+ cbar_height = max(cbar_min, scale)
+ ax.set_position([hm_pos.x0, hm_pos.y0, scale, scale])
+ cax.set_position([hm_pos.x0 + scale * .95, hm_pos.y0, scale / len(cm),
+ cbar_height])
+
+ # Make the heatmap subplot (not the colorbar) the active axis object so
+ # labels apply correctly on return
+ plt.sca(ax)
+ return heatmap
def _add_sample_size_to_xtick_labels(ser, classes):
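
The rewritten plotting helper gives the heatmap and its colorbar separate axes and rescales both by the number of classes, so large confusion matrices stay legible and the colorbar never shrinks below cbar_min. A minimal self-contained sketch of the same set_position resizing idea (constants mirror the hunk above; the data is illustrative):

    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns

    cm = np.eye(4)  # a toy 4-class confusion matrix
    fig, (ax, cax) = plt.subplots(ncols=2, constrained_layout=True)
    sns.heatmap(cm, ax=ax, cbar_ax=cax, square=True,
                cbar_kws={'label': 'Proportion'})

    pos = ax.get_position()
    scale = len(cm) / 20  # grow the axes with the class count
    ax.set_position([pos.x0, pos.y0, scale, scale])
    # Keep the colorbar readable even when the matrix is small.
    cax.set_position([pos.x0 + scale * .95, pos.y0,
                      scale / len(cm), max(.15, scale)])
    plt.sca(ax)  # return focus to the heatmap, not the colorbar
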
=====================================
setup.py
=====================================
@@ -1,5 +1,5 @@
# ----------------------------------------------------------------------------
-# Copyright (c) 2017-2019, QIIME 2 development team.
+# Copyright (c) 2017-2020, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
View it on GitLab: https://salsa.debian.org/med-team/q2-sample-classifier/-/commit/31dd5bad629cc7b9cdae72b5f1006c825981f5ca