[med-svn] [Git][med-team/q2-sample-classifier][master] 7 commits: Autopkgtest for all supported Python3 versions
Andreas Tille (@tille)
gitlab at salsa.debian.org
Sun Feb 18 17:03:30 GMT 2024
Andreas Tille pushed to branch master at Debian Med / q2-sample-classifier
Commits:
4f306161 by Andreas Tille at 2024-02-18T15:53:38+01:00
Autopkgtest for all supported Python3 versions
- - - - -
4f644668 by Andreas Tille at 2024-02-18T15:53:56+01:00
New upstream version 2024.2.0
- - - - -
439f3d73 by Andreas Tille at 2024-02-18T15:53:56+01:00
routine-update: New upstream version
- - - - -
a6aa9179 by Andreas Tille at 2024-02-18T15:53:57+01:00
Update upstream source from tag 'upstream/2024.2.0'
Update to upstream version '2024.2.0'
with Debian dir d09b2ecedb8bc65f1f2585907c981ba7fe5bbae7
- - - - -
08a47986 by Andreas Tille at 2024-02-18T15:54:18+01:00
routine-update: Regenerate debian/control from debian/control.in
- - - - -
a7bae32a by Andreas Tille at 2024-02-18T16:23:20+01:00
Refresh patches + port to Pandas 2.0
- - - - -
11dcb446 by Andreas Tille at 2024-02-18T17:30:49+01:00
Upload to unstable
- - - - -
16 changed files:
- .github/workflows/ci-dev.yaml
- README.md
- debian/changelog
- debian/control
- debian/patches/fix-autopkgtest.patch
- + debian/patches/pandas2.0.patch
- debian/patches/series
- debian/patches/sklearn-1.2.1.patch
- debian/tests/control
- debian/tests/run-unit-test
- q2_sample_classifier/_transformer.py
- q2_sample_classifier/_version.py
- q2_sample_classifier/classify.py
- q2_sample_classifier/plugin_setup.py
- q2_sample_classifier/tests/test_types_formats_transformers.py
- q2_sample_classifier/utilities.py
Changes:
=====================================
.github/workflows/ci-dev.yaml
=====================================
@@ -9,4 +9,4 @@ jobs:
ci:
uses: qiime2/distributions/.github/workflows/lib-ci-dev.yaml@dev
with:
- distro: core
\ No newline at end of file
+ distro: amplicon
=====================================
README.md
=====================================
@@ -1,5 +1,5 @@
# q2-sample-classifier
-
+
This is a QIIME 2 plugin. For details on QIIME 2, see https://qiime2.org.
\ No newline at end of file
=====================================
debian/changelog
=====================================
@@ -1,3 +1,13 @@
+q2-sample-classifier (2024.2.0-1) unstable; urgency=medium
+
+ * Team upload.
+ * New upstream version
+ * Autopkgtest for all supported Python3 versions
+ * Regenerate debian/control from debian/control.in (routine-update)
+ * Port to Pandas 2.0
+
+ -- Andreas Tille <tille at debian.org> Sun, 18 Feb 2024 17:28:32 +0100
+
q2-sample-classifier (2023.9.0-1) unstable; urgency=medium
* Team upload.
=====================================
debian/control
=====================================
@@ -8,7 +8,7 @@ Section: science
Priority: optional
Build-Depends: debhelper-compat (= 13),
dh-sequence-python3,
- qiime (>= 2023.9),
+ qiime (>= 2024.2),
python3-all,
python3-setuptools,
python3-pytest <!nocheck>
@@ -25,9 +25,9 @@ Depends: ${shlibs:Depends},
${python3:Depends},
python3-distutils,
python3-sklearn,
- qiime (>= 2023.9),
- q2-types (>= 2023.9),
- q2-feature-table (>= 2023.9)
+ qiime (>= 2024.2),
+ q2-types (>= 2024.2),
+ q2-feature-table (>= 2024.2)
Description: QIIME 2 plugin for machine learning prediction of sample data
QIIME 2 is a powerful, extensible, and decentralized microbiome analysis
package with a focus on data and analysis transparency. QIIME 2 enables
=====================================
debian/patches/fix-autopkgtest.patch
=====================================
@@ -2,9 +2,9 @@ Description: Fix autopkgtest errors that were failing due to sklearn changed API
assignment of multi-dimensiondal array to pandas
Author: Mohammed Bilal <mdbilal at disroot.org>
Last-Update: 2022-09-09
---- q2-sample-classifier.orig/q2_sample_classifier/utilities.py
-+++ q2-sample-classifier/q2_sample_classifier/utilities.py
-@@ -257,7 +257,7 @@
+--- a/q2_sample_classifier/utilities.py
++++ b/q2_sample_classifier/utilities.py
+@@ -258,7 +258,7 @@ def _extract_rfe_scores(rfecv):
for n in range(len(rfecv.grid_scores_)-1, -1, -1)]
if x[0] < 1:
x[0] = 1
@@ -13,7 +13,7 @@ Last-Update: 2022-09-09
def nested_cross_validation(table, metadata, cv, random_state, n_jobs,
-@@ -516,13 +516,13 @@
+@@ -523,13 +523,13 @@ def _extract_estimator_parameters(estima
# (drop pipeline params and individual base estimators)
estimator_params = {k: v for k, v in estimator.get_params().items() if
k.startswith('est__') and k != 'est__base_estimator'}
@@ -29,7 +29,7 @@ Last-Update: 2022-09-09
rfep.savefig(join(output_dir, 'rfe_plot.png'))
rfep.savefig(join(output_dir, 'rfe_plot.pdf'))
plt.close('all')
-@@ -821,7 +821,7 @@
+@@ -828,7 +828,7 @@ def _train_adaboost_base_estimator(table
return Pipeline(
[('dv', estimator.named_steps.dv),
('est', adaboost_estimator(estimator.named_steps.est,
=====================================
debian/patches/pandas2.0.patch
=====================================
@@ -0,0 +1,47 @@
+Description: Port to Pandas 2.0
+Author: Andreas Tille <tille at debian.org>
+Last-Update: Sun, 18 Feb 2024 15:54:19 +0100
+
+--- a/q2_sample_classifier/tests/test_estimators.py
++++ b/q2_sample_classifier/tests/test_estimators.py
+@@ -117,7 +117,7 @@ class EstimatorsTests(SampleClassifierTe
+ index_col=0, names=['feature', 'importance'])
+ self.exp_pred = pd.read_csv(
+ self.get_data_path('predictions.tsv'), sep='\t', header=0,
+- index_col=0, squeeze=True)
++ index_col=0).squeeze('columns')
+ index = pd.Index(['A', 'B', 'C', 'D'], name='id')
+ self.table_percnorm = qiime2.Artifact.import_data(
+ FeatureTable[PercentileNormalized], pd.DataFrame(
+--- a/q2_sample_classifier/tests/test_types_formats_transformers.py
++++ b/q2_sample_classifier/tests/test_types_formats_transformers.py
+@@ -84,8 +84,7 @@ class TestSemanticTypes(SampleClassifier
+ exp = pd.Series([True, False, True, False, True, False],
+ name='outlier', index=exp_index)
+ obs = transformer(exp)
+- obs = pd.read_csv(str(obs), sep='\t', header=0, index_col=0,
+- squeeze=True)
++ obs = pd.read_csv(str(obs), sep='\t', header=0, index_col=0).squeeze('columns')
+ self.assertEqual(sorted(exp), sorted(obs))
+
+ def test_boolean_format_to_pd_series(self):
+@@ -151,8 +150,7 @@ class TestSemanticTypes(SampleClassifier
+ exp = pd.Series([1, 2, 3, 4],
+ name='prediction', index=['a', 'b', 'c', 'd'])
+ obs = transformer(exp)
+- obs = pd.read_csv(str(obs), sep='\t', header=0, index_col=0,
+- squeeze=True)
++ obs = pd.read_csv(str(obs), sep='\t', header=0, index_col=0).squeeze('columns')
+ pdt.assert_series_equal(obs, exp)
+
+ def test_pd_series_to_Predictions_format_allow_nans(self):
+@@ -160,8 +158,7 @@ class TestSemanticTypes(SampleClassifier
+ exp = pd.Series([1, np.nan, 3, np.nan],
+ name='prediction', index=['a', 'b', 'c', 'd'])
+ obs = transformer(exp)
+- obs = pd.read_csv(str(obs), sep='\t', header=0, index_col=0,
+- squeeze=True)
++ obs = pd.read_csv(str(obs), sep='\t', header=0, index_col=0).squeeze('columns')
+ pdt.assert_series_equal(obs, exp)
+
+ def test_Predictions_format_to_pd_series(self):
=====================================
debian/patches/series
=====================================
@@ -4,3 +4,4 @@ fix-autopkgtest.patch
sklearn-1.2.1.patch
convert-estimator.patch
python3.12.patch
+pandas2.0.patch
=====================================
debian/patches/sklearn-1.2.1.patch
=====================================
@@ -7,9 +7,9 @@ Forwarded: https://github.com/qiime2/q2-sample-classifier/issues/227
Last-Update: 2023-02-02
---
This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
---- q2-sample-classifier.orig/q2_sample_classifier/tests/test_estimators.py
-+++ q2-sample-classifier/q2_sample_classifier/tests/test_estimators.py
-@@ -135,7 +135,7 @@
+--- a/q2_sample_classifier/tests/test_estimators.py
++++ b/q2_sample_classifier/tests/test_estimators.py
+@@ -135,7 +135,7 @@ class EstimatorsTests(SampleClassifierTe
dv = DictVectorizer()
dv.fit(dicts)
features = table.ids('observation')
@@ -18,9 +18,9 @@ This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
self.assertEqual(len(dicts), len(table.ids()))
for dict_row, (table_row, _, _) in zip(dicts, table.iter()):
for feature, count in zip(features, table_row):
---- q2-sample-classifier.orig/q2_sample_classifier/utilities.py
-+++ q2-sample-classifier/q2_sample_classifier/utilities.py
-@@ -238,7 +238,7 @@
+--- a/q2_sample_classifier/utilities.py
++++ b/q2_sample_classifier/utilities.py
+@@ -239,7 +239,7 @@ def _rfecv_feature_selection(feature_dat
# Describe top features
n_opt = rfecv.named_steps.est.n_features_
importance = _extract_important_features(
@@ -29,7 +29,7 @@ This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
rfecv.named_steps.est.ranking_)
importance = sort_importances(importance, ascending=True)[:n_opt]
-@@ -252,9 +252,10 @@
+@@ -253,9 +253,10 @@ def _extract_rfe_scores(rfecv):
# If using fractional step, step = integer of fraction * n_features
if rfecv.step < 1:
rfecv.step = int(rfecv.step * n_features)
@@ -42,7 +42,7 @@ This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
if x[0] < 1:
x[0] = 1
return pd.Series(rfecv.cv_results_['mean_test_score'], index=x, name='Accuracy')
-@@ -404,12 +405,12 @@
+@@ -411,12 +412,12 @@ def _calculate_feature_importances(estim
# feature_importances_ or coef_ to report feature importance/weights
try:
importances = _extract_important_features(
@@ -57,7 +57,7 @@ This patch header follows DEP-3: http://dep.debian.net/deps/dep3/
estimator.named_steps.est.coef_)
return importances
-@@ -711,7 +712,7 @@
+@@ -718,7 +719,7 @@ def _mean_feature_importance(importances
def _null_feature_importance(table):
feature_extractor = DictVectorizer()
feature_extractor.fit(table)
=====================================
debian/tests/control
=====================================
@@ -1,3 +1,3 @@
Tests: run-unit-test
-Depends: @, python3-pytest-cov
+Depends: @, python3-pytest-cov, python3-all
Restrictions: allow-stderr, skip-not-installable
=====================================
debian/tests/run-unit-test
=====================================
@@ -17,4 +17,7 @@ if [ ! -f /usr/lib/python3/dist-packages/pytest_cov/__init__.py ] ; then
fi
# Run build-time tests
-py.test-3 --cov=${pkg}
+for py in $(py3versions -s 2> /dev/null)
+do
+ ${py} -m pytest -v --cov=${pkg}
+done
=====================================
q2_sample_classifier/_transformer.py
=====================================
@@ -136,7 +136,26 @@ def _a(dirfmt: SampleEstimatorDirFmt) -> Pipeline:
with tarfile.open(str(sklearn_pipeline)) as tar:
tmpdir = model.DirectoryFormat()
dirname = str(tmpdir)
- tar.extractall(dirname)
+
+ def is_within_directory(directory, target):
+
+ abs_directory = os.path.abspath(directory)
+ abs_target = os.path.abspath(target)
+
+ prefix = os.path.commonprefix([abs_directory, abs_target])
+
+ return prefix == abs_directory
+
+ def safe_extract(tar, path=".", members=None, *, numeric_owner=False):
+
+ for member in tar.getmembers():
+ member_path = os.path.join(path, member.name)
+ if not is_within_directory(path, member_path):
+ raise Exception("Attempted Path Traversal in Tar File")
+
+ tar.extractall(path, members, numeric_owner=numeric_owner)
+
+ safe_extract(tar, dirname)
pipeline = joblib.load(os.path.join(dirname, 'sklearn_pipeline.pkl'))
for fn in tar.getnames():
os.unlink(os.path.join(dirname, fn))
=====================================
q2_sample_classifier/_version.py
=====================================
@@ -23,9 +23,9 @@ def get_keywords():
# setup.py/versioneer.py will grep for the variable names, so they must
# each be defined on a line of their own. _version.py will just call
# get_keywords().
- git_refnames = " (tag: 2023.9.0, Release-2023.9)"
- git_full = "8c6fb31849f929d00ae6b7a5b6b92fd1cfebb10b"
- git_date = "2023-10-03 22:04:15 +0000"
+ git_refnames = " (tag: 2024.2.0, Release-2024.2)"
+ git_full = "e32969bfe9c0e177ca0d5cfba270216c98bbbd9e"
+ git_date = "2024-02-16 21:57:23 +0000"
keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
return keywords
=====================================
q2_sample_classifier/classify.py
=====================================
@@ -15,6 +15,7 @@ from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
import qiime2
+from qiime2.plugin import get_available_cores
import pandas as pd
import biom
import skbio
@@ -107,6 +108,9 @@ def _fit_predict_knn_cv(
x: pd.DataFrame, y: pd.Series, k: int, cv: int,
random_state: int, n_jobs: int
) -> (pd.Series, pd.Series):
+ if n_jobs == 0:
+ n_jobs = get_available_cores()
+
kf = KFold(n_splits=cv, shuffle=True, random_state=random_state)
# train and test with CV
@@ -291,6 +295,9 @@ def fit_regressor(table: biom.Table,
def predict_base(table, sample_estimator, n_jobs):
+ if n_jobs == 0:
+ n_jobs = get_available_cores()
+
# extract feature data from biom
feature_data = _extract_features(table)
index = table.ids()
=====================================
q2_sample_classifier/plugin_setup.py
=====================================
@@ -10,10 +10,10 @@ import importlib
from qiime2.plugin import (
Int, Str, Float, Range, Bool, Plugin, Metadata, Choices, MetadataColumn,
- Numeric, Categorical, Citations, Visualization, TypeMatch)
+ Numeric, Categorical, Citations, Visualization, TypeMatch, Threads)
from q2_types.feature_table import (
FeatureTable, Frequency, RelativeFrequency, PresenceAbsence, Balance,
- PercentileNormalized, Design)
+ PercentileNormalized, Design, Composition)
from q2_types.sample_data import SampleData
from q2_types.feature_data import FeatureData
from q2_types.distance_matrix import DistanceMatrix
@@ -89,7 +89,8 @@ predict_description = (
'contain overlapping features with the feature table used to train '
'the estimator.')
-inputs = {'table': FeatureTable[Frequency]}
+inputs = {'table': FeatureTable[
+ Frequency | RelativeFrequency | PresenceAbsence | Composition]}
input_descriptions = {'table': 'Feature table containing all features that '
'should be used for target prediction.',
@@ -99,7 +100,7 @@ input_descriptions = {'table': 'Feature table containing all features that '
parameters = {
'base': {
'random_state': Int,
- 'n_jobs': Int,
+ 'n_jobs': Threads,
'n_estimators': Int % Range(1, None),
'missing_samples': Str % Choices(['error', 'ignore'])},
'splitter': {
@@ -492,7 +493,7 @@ plugin.visualizers.register_function(
T = TypeMatch([Frequency, RelativeFrequency, PresenceAbsence, Balance,
- PercentileNormalized, Design])
+ PercentileNormalized, Design, Composition])
plugin.methods.register_function(
function=split_table,
inputs={'table': FeatureTable[T]},
=====================================
q2_sample_classifier/tests/test_types_formats_transformers.py
=====================================
@@ -421,7 +421,28 @@ class TestTransformers(SampleEstimatorTestBase):
def read_pipeline(pipeline_filepath):
with tarfile.open(pipeline_filepath) as tar:
dirname = tempfile.mkdtemp()
- tar.extractall(dirname)
+
+ def is_within_directory(directory, target):
+
+ abs_directory = os.path.abspath(directory)
+ abs_target = os.path.abspath(target)
+
+ prefix = os.path.commonprefix([abs_directory, abs_target])
+
+ return prefix == abs_directory
+
+ def safe_extract(tar, path=".", members=None, *,
+ numeric_owner=False):
+
+ for member in tar.getmembers():
+ member_path = os.path.join(path, member.name)
+ if not is_within_directory(path, member_path):
+ raise Exception("Attempted Path Traversal in Tar"
+ "File")
+
+ tar.extractall(path, members, numeric_owner=numeric_owner)
+
+ safe_extract(tar, dirname)
pipeline = joblib.load(os.path.join(dirname,
'sklearn_pipeline.pkl'))
for fn in tar.getnames():
=====================================
q2_sample_classifier/utilities.py
=====================================
@@ -27,6 +27,7 @@ from sklearn.tree import (
)
from sklearn.pipeline import Pipeline
+from qiime2.plugin import get_available_cores
import q2templates
import pandas as pd
import numpy as np
@@ -264,6 +265,9 @@ def nested_cross_validation(table, metadata, cv, random_state, n_jobs,
n_estimators, estimator, stratify,
parameter_tuning, classification, scoring,
missing_samples='error'):
+ if n_jobs == 0:
+ n_jobs = get_available_cores()
+
# extract column name from NumericMetadataColumn
column = metadata.name
@@ -301,6 +305,9 @@ def _fit_estimator(features, targets, estimator, n_estimators=100, step=0.05,
cv=5, random_state=None, n_jobs=1,
optimize_feature_selection=False, parameter_tuning=False,
missing_samples='error', classification=True):
+ if n_jobs == 0:
+ n_jobs = get_available_cores()
+
# extract column name from CategoricalMetadataColumn
column = targets.to_series().name
View it on GitLab: https://salsa.debian.org/med-team/q2-sample-classifier/-/compare/3f43f1480d15ba84395a69c2b029f8a54ca0ba60...11dcb446806e7c9a4050806410a2dcad3e9e2b81
--
View it on GitLab: https://salsa.debian.org/med-team/q2-sample-classifier/-/compare/3f43f1480d15ba84395a69c2b029f8a54ca0ba60...11dcb446806e7c9a4050806410a2dcad3e9e2b81
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20240218/82cf0082/attachment-0001.htm>
More information about the debian-med-commit mailing list