[med-svn] [Git][med-team/sourmash][master] skip branchwater, refresh sourmash.h
Michael R. Crusoe (@crusoe)
gitlab at salsa.debian.org
Mon Oct 28 17:01:43 GMT 2024
Michael R. Crusoe pushed to branch master at Debian Med / sourmash
Commits:
1e81fd74 by Michael R. Crusoe at 2024-10-28T18:01:22+01:00
skip branchwater, refresh sourmash.h
- - - - -
3 changed files:
- debian/patches/series
- − debian/patches/skip-RevIndex-test
- debian/patches/skip-branchwater-feature
Changes:
=====================================
debian/patches/series
=====================================
@@ -1,3 +1,2 @@
skip-branchwater-feature
soften-deps
-skip-RevIndex-test
=====================================
debian/patches/skip-RevIndex-test deleted
=====================================
@@ -1,150 +0,0 @@
-Author: Michael R. Crusoe <crusoe at debian.org>
-Description: Temporarily skip the RevIndex tests while we debug that
-Forwarded: not-needed
---- sourmash.orig/tests/test_index.py
-+++ sourmash/tests/test_index.py
-@@ -20,7 +20,6 @@
- StandaloneManifestIndex,
- )
- from sourmash.signature import load_one_signature_from_json, save_signatures_to_json
--from sourmash.index.revindex import RevIndex
- from sourmash.sbt import SBT, GraphFactory
- from sourmash import sourmash_args
- from sourmash.search import JaccardSearch, SearchType
-@@ -1812,108 +1811,6 @@
- assert ss_tup == ss_lazy_tup
-
-
--def test_revindex_index_search():
-- # confirm that RevIndex works
-- sig2 = utils.get_test_data("2.fa.sig")
-- sig47 = utils.get_test_data("47.fa.sig")
-- sig63 = utils.get_test_data("63.fa.sig")
--
-- ss2 = load_one_signature_from_json(sig2, ksize=31)
-- ss47 = load_one_signature_from_json(sig47)
-- ss63 = load_one_signature_from_json(sig63)
--
-- lidx = RevIndex(template=ss2.minhash)
-- lidx.insert(ss2)
-- lidx.insert(ss47)
-- lidx.insert(ss63)
--
-- # now, search for sig2
-- sr = lidx.search(ss2, threshold=1.0)
-- print([s[1].name for s in sr])
-- assert len(sr) == 1
-- assert sr[0][1] == ss2
--
-- # search for sig47 with lower threshold; search order not guaranteed.
-- sr = lidx.search(ss47, threshold=0.1)
-- print([s[1].name for s in sr])
-- assert len(sr) == 2
-- sr.sort(key=lambda x: -x[0])
-- assert sr[0][1] == ss47
-- assert sr[1][1] == ss63
--
-- # search for sig63 with lower threshold; search order not guaranteed.
-- sr = lidx.search(ss63, threshold=0.1)
-- print([s[1].name for s in sr])
-- assert len(sr) == 2
-- sr.sort(key=lambda x: -x[0])
-- assert sr[0][1] == ss63
-- assert sr[1][1] == ss47
--
-- # search for sig63 with high threshold => 1 match
-- sr = lidx.search(ss63, threshold=0.8)
-- print([s[1].name for s in sr])
-- assert len(sr) == 1
-- sr.sort(key=lambda x: -x[0])
-- assert sr[0][1] == ss63
--
--
--def test_revindex_gather():
-- # check that RevIndex.best_containment works.
-- sig2 = utils.get_test_data("2.fa.sig")
-- sig47 = utils.get_test_data("47.fa.sig")
-- sig63 = utils.get_test_data("63.fa.sig")
--
-- ss2 = load_one_signature_from_json(sig2, ksize=31)
-- ss47 = load_one_signature_from_json(sig47)
-- ss63 = load_one_signature_from_json(sig63)
--
-- lidx = RevIndex(template=ss2.minhash)
-- lidx.insert(ss2)
-- lidx.insert(ss47)
-- lidx.insert(ss63)
--
-- match = lidx.best_containment(ss2)
-- assert match
-- assert match.score == 1.0
-- assert match.signature == ss2
--
-- match = lidx.best_containment(ss47)
-- assert match
-- assert match.score == 1.0
-- assert match.signature == ss47
--
--
--def test_revindex_gather_ignore():
-- # check that RevIndex gather ignores things properly.
-- sig2 = utils.get_test_data("2.fa.sig")
-- sig47 = utils.get_test_data("47.fa.sig")
-- sig63 = utils.get_test_data("63.fa.sig")
--
-- ss2 = load_one_signature_from_json(sig2, ksize=31)
-- ss47 = load_one_signature_from_json(sig47, ksize=31)
-- ss63 = load_one_signature_from_json(sig63, ksize=31)
--
-- # construct an index...
-- lidx = RevIndex(template=ss2.minhash, signatures=[ss2, ss47, ss63])
--
-- # ...now search with something that should ignore sig47, the exact match.
-- search_fn = JaccardSearchBestOnly_ButIgnore([ss47])
--
-- results = list(lidx.find(search_fn, ss47))
-- results = [ss.signature for ss in results]
--
-- def is_found(ss, xx):
-- for q in xx:
-- print(ss, ss.similarity(q))
-- if ss.similarity(q) == 1.0:
-- return True
-- return False
--
-- assert not is_found(ss47, results)
-- assert not is_found(ss2, results)
-- assert is_found(ss63, results)
--
--
- def test_standalone_manifest_signatures(runtmp):
- # build a StandaloneManifestIndex and test 'signatures' method.
-
---- sourmash.orig/tests/test_index_protocol.py
-+++ sourmash/tests/test_index_protocol.py
-@@ -18,7 +18,6 @@
- )
- from sourmash.index import CounterGather
- from sourmash.index.sqlite_index import SqliteIndex
--from sourmash.index.revindex import RevIndex
- from sourmash.sbt import SBT, GraphFactory
- from sourmash.manifest import CollectionManifest, BaseCollectionManifest
- from sourmash.lca.lca_db import LCA_Database, load_single_database
-@@ -147,17 +146,6 @@
- return db
-
-
--def build_revindex(runtmp):
-- ss2, ss47, ss63 = _load_three_sigs()
--
-- lidx = RevIndex(template=ss2.minhash)
-- lidx.insert(ss2)
-- lidx.insert(ss47)
-- lidx.insert(ss63)
--
-- return lidx
--
--
- def build_lca_index_save_load_sql(runtmp):
- db = build_lca_index(runtmp)
- outfile = runtmp.output("db.lca.json")
=====================================
debian/patches/skip-branchwater-feature
=====================================
@@ -46,3 +46,732 @@ Forwarded: not-needed
+#rocksdb = { version = "0.21.0", optional = true }
[target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies]
criterion = "0.5.1"
+--- sourmash.orig/include/sourmash.h
++++ sourmash/include/sourmash.h
+@@ -56,8 +56,6 @@
+
+ typedef struct SourmashNodegraph SourmashNodegraph;
+
+-typedef struct SourmashRevIndex SourmashRevIndex;
+-
+ typedef struct SourmashSearchResult SourmashSearchResult;
+
+ typedef struct SourmashSignature SourmashSignature;
+@@ -312,43 +310,6 @@
+ uintptr_t starting_size,
+ uintptr_t n_tables);
+
+-void revindex_free(SourmashRevIndex *ptr);
+-
+-const SourmashSearchResult *const *revindex_gather(const SourmashRevIndex *ptr,
+- const SourmashSignature *sig_ptr,
+- double threshold,
+- bool _do_containment,
+- bool _ignore_abundance,
+- uintptr_t *size);
+-
+-uint64_t revindex_len(const SourmashRevIndex *ptr);
+-
+-SourmashRevIndex *revindex_new_with_paths(const SourmashStr *const *search_sigs_ptr,
+- uintptr_t insigs,
+- const SourmashKmerMinHash *template_ptr,
+- uintptr_t threshold,
+- const SourmashKmerMinHash *const *queries_ptr,
+- uintptr_t inqueries,
+- bool keep_sigs);
+-
+-SourmashRevIndex *revindex_new_with_sigs(const SourmashSignature *const *search_sigs_ptr,
+- uintptr_t insigs,
+- const SourmashKmerMinHash *template_ptr,
+- uintptr_t threshold,
+- const SourmashKmerMinHash *const *queries_ptr,
+- uintptr_t inqueries);
+-
+-uint64_t revindex_scaled(const SourmashRevIndex *ptr);
+-
+-const SourmashSearchResult *const *revindex_search(const SourmashRevIndex *ptr,
+- const SourmashSignature *sig_ptr,
+- double threshold,
+- bool do_containment,
+- bool _ignore_abundance,
+- uintptr_t *size);
+-
+-SourmashSignature **revindex_signatures(const SourmashRevIndex *ptr, uintptr_t *size);
+-
+ SourmashStr searchresult_filename(const SourmashSearchResult *ptr);
+
+ void searchresult_free(SourmashSearchResult *ptr);
+--- sourmash.orig/src/core/cbindgen.toml
++++ sourmash/src/core/cbindgen.toml
+@@ -8,7 +8,6 @@
+
+ [parse.expand]
+ crates = ["sourmash"]
+-features = ["branchwater"]
+
+ [enum]
+ rename_variants = "QualifiedScreamingSnakeCase"
+--- sourmash.orig/src/core/src/ffi/index/revindex.rs
++++ sourmash/src/core/src/ffi/index/revindex.rs
+@@ -1,271 +0,0 @@
+-use std::slice;
+-
+-use camino::Utf8PathBuf as PathBuf;
+-
+-use crate::ffi::index::SourmashSearchResult;
+-use crate::ffi::minhash::SourmashKmerMinHash;
+-use crate::ffi::signature::SourmashSignature;
+-use crate::ffi::utils::{ForeignObject, SourmashStr};
+-use crate::index::revindex::mem_revindex::RevIndex;
+-use crate::index::Index;
+-use crate::prelude::*;
+-use crate::signature::{Signature, SigsTrait};
+-use crate::sketch::minhash::KmerMinHash;
+-use crate::sketch::Sketch;
+-
+-pub struct SourmashRevIndex;
+-
+-impl ForeignObject for SourmashRevIndex {
+- type RustObject = RevIndex;
+-}
+-
+-// TODO: remove this when it is possible to pass Selection thru the FFI
+-fn from_template(template: &Sketch) -> Selection {
+- let (num, scaled) = match template {
+- Sketch::MinHash(mh) => (mh.num(), mh.scaled() as u32),
+- Sketch::LargeMinHash(mh) => (mh.num(), mh.scaled() as u32),
+- _ => unimplemented!(),
+- };
+-
+- Selection::builder()
+- .ksize(template.ksize() as u32)
+- .num(num)
+- .scaled(scaled)
+- .build()
+-}
+-
+-ffi_fn! {
+-unsafe fn revindex_new_with_paths(
+- search_sigs_ptr: *const *const SourmashStr,
+- insigs: usize,
+- template_ptr: *const SourmashKmerMinHash,
+- threshold: usize,
+- queries_ptr: *const *const SourmashKmerMinHash,
+- inqueries: usize,
+- keep_sigs: bool,
+-) -> Result<*mut SourmashRevIndex> {
+- let search_sigs: Vec<PathBuf> = {
+- assert!(!search_sigs_ptr.is_null());
+- slice::from_raw_parts(search_sigs_ptr, insigs)
+- .iter()
+- .map(|path| {
+- let mut new_path = PathBuf::new();
+- new_path.push(SourmashStr::as_rust(*path).as_str());
+- new_path
+- })
+- .collect()
+- };
+-
+- let template = {
+- assert!(!template_ptr.is_null());
+- //TODO: avoid clone here
+- Sketch::MinHash(SourmashKmerMinHash::as_rust(template_ptr).clone())
+- };
+-
+- let queries_vec: Vec<KmerMinHash>;
+- let queries: Option<&[KmerMinHash]> = if queries_ptr.is_null() {
+- None
+- } else {
+- queries_vec = slice::from_raw_parts(queries_ptr, inqueries)
+- .iter()
+- .map(|mh_ptr|
+- // TODO: avoid this clone
+- SourmashKmerMinHash::as_rust(*mh_ptr).clone())
+- .collect();
+- Some(queries_vec.as_ref())
+- };
+-
+- let selection = from_template(&template);
+-
+- let revindex = RevIndex::new(
+- search_sigs.as_ref(),
+- &selection,
+- threshold,
+- queries,
+- keep_sigs,
+- )?;
+- Ok(SourmashRevIndex::from_rust(revindex))
+-}
+-}
+-
+-ffi_fn! {
+-unsafe fn revindex_new_with_sigs(
+- search_sigs_ptr: *const *const SourmashSignature,
+- insigs: usize,
+- template_ptr: *const SourmashKmerMinHash,
+- threshold: usize,
+- queries_ptr: *const *const SourmashKmerMinHash,
+- inqueries: usize,
+-) -> Result<*mut SourmashRevIndex> {
+- let search_sigs: Vec<Signature> = {
+- assert!(!search_sigs_ptr.is_null());
+- slice::from_raw_parts(search_sigs_ptr, insigs)
+- .iter()
+- .map(|sig| SourmashSignature::as_rust(*sig))
+- .cloned()
+- .collect()
+- };
+-
+- let template = {
+- assert!(!template_ptr.is_null());
+- //TODO: avoid clone here
+- Sketch::MinHash(SourmashKmerMinHash::as_rust(template_ptr).clone())
+- };
+-
+- let queries_vec: Vec<KmerMinHash>;
+- let queries: Option<&[KmerMinHash]> = if queries_ptr.is_null() {
+- None
+- } else {
+- queries_vec = slice::from_raw_parts(queries_ptr, inqueries)
+- .iter()
+- .map(|mh_ptr|
+- // TODO: avoid this clone
+- SourmashKmerMinHash::as_rust(*mh_ptr).clone())
+- .collect();
+- Some(queries_vec.as_ref())
+- };
+-
+- let selection = from_template(&template);
+- let revindex = RevIndex::new_with_sigs(search_sigs, &selection, threshold, queries)?;
+- Ok(SourmashRevIndex::from_rust(revindex))
+-}
+-}
+-
+-#[no_mangle]
+-pub unsafe extern "C" fn revindex_free(ptr: *mut SourmashRevIndex) {
+- SourmashRevIndex::drop(ptr);
+-}
+-
+-ffi_fn! {
+-unsafe fn revindex_search(
+- ptr: *const SourmashRevIndex,
+- sig_ptr: *const SourmashSignature,
+- threshold: f64,
+- do_containment: bool,
+- _ignore_abundance: bool,
+- size: *mut usize,
+-) -> Result<*const *const SourmashSearchResult> {
+- let revindex = SourmashRevIndex::as_rust(ptr);
+- let sig = SourmashSignature::as_rust(sig_ptr);
+-
+- if sig.signatures.is_empty() {
+- *size = 0;
+- return Ok(std::ptr::null::<*const SourmashSearchResult>());
+- }
+-
+- let mh = if let Sketch::MinHash(mh) = &sig.signatures[0] {
+- mh
+- } else {
+- // TODO: what if it is not a mh?
+- unimplemented!()
+- };
+-
+- let results: Vec<(f64, Signature, String)> = revindex
+- .find_signatures(mh, threshold, do_containment, true)?
+- .into_iter()
+- .collect();
+-
+- // FIXME: use the ForeignObject trait, maybe define new method there...
+- let ptr_sigs: Vec<*const SourmashSearchResult> = results
+- .into_iter()
+- .map(|x| Box::into_raw(Box::new(x)) as *const SourmashSearchResult)
+- .collect();
+-
+- let b = ptr_sigs.into_boxed_slice();
+- *size = b.len();
+-
+- Ok(Box::into_raw(b) as *const *const SourmashSearchResult)
+-}
+-}
+-
+-ffi_fn! {
+-unsafe fn revindex_gather(
+- ptr: *const SourmashRevIndex,
+- sig_ptr: *const SourmashSignature,
+- threshold: f64,
+- _do_containment: bool,
+- _ignore_abundance: bool,
+- size: *mut usize,
+-) -> Result<*const *const SourmashSearchResult> {
+- let revindex = SourmashRevIndex::as_rust(ptr);
+- let sig = SourmashSignature::as_rust(sig_ptr);
+-
+- if sig.signatures.is_empty() {
+- *size = 0;
+- return Ok(std::ptr::null::<*const SourmashSearchResult>());
+- }
+-
+- let mh = if let Sketch::MinHash(mh) = &sig.signatures[0] {
+- mh
+- } else {
+- // TODO: what if it is not a mh?
+- unimplemented!()
+- };
+-
+- // TODO: proper threshold calculation
+- let threshold: usize = (threshold * (mh.size() as f64)) as _;
+-
+- let counter = revindex.counter_for_query(mh);
+- dbg!(&counter);
+-
+- let results: Vec<(f64, Signature, String)> = revindex
+- .gather(counter, threshold, mh)
+- .unwrap() // TODO: proper error handling
+- .into_iter()
+- .map(|r| {
+- let filename = r.filename().to_owned();
+- let sig = r.get_match();
+- (r.f_match(), sig, filename)
+- })
+- .collect();
+-
+- // FIXME: use the ForeignObject trait, maybe define new method there...
+- let ptr_sigs: Vec<*const SourmashSearchResult> = results
+- .into_iter()
+- .map(|x| Box::into_raw(Box::new(x)) as *const SourmashSearchResult)
+- .collect();
+-
+- let b = ptr_sigs.into_boxed_slice();
+- *size = b.len();
+-
+- Ok(Box::into_raw(b) as *const *const SourmashSearchResult)
+-}
+-}
+-
+-#[no_mangle]
+-pub unsafe extern "C" fn revindex_scaled(ptr: *const SourmashRevIndex) -> u64 {
+- let revindex = SourmashRevIndex::as_rust(ptr);
+- if let Sketch::MinHash(mh) = revindex.template() {
+- mh.scaled()
+- } else {
+- unimplemented!()
+- }
+-}
+-
+-#[no_mangle]
+-pub unsafe extern "C" fn revindex_len(ptr: *const SourmashRevIndex) -> u64 {
+- let revindex = SourmashRevIndex::as_rust(ptr);
+- revindex.len() as u64
+-}
+-
+-ffi_fn! {
+-unsafe fn revindex_signatures(
+- ptr: *const SourmashRevIndex,
+- size: *mut usize,
+-) -> Result<*mut *mut SourmashSignature> {
+- let revindex = SourmashRevIndex::as_rust(ptr);
+-
+- let sigs = revindex.signatures();
+-
+- // FIXME: use the ForeignObject trait, maybe define new method there...
+- let ptr_sigs: Vec<*mut SourmashSignature> = sigs
+- .into_iter()
+- .map(|x| Box::into_raw(Box::new(x)) as *mut SourmashSignature)
+- .collect();
+-
+- let b = ptr_sigs.into_boxed_slice();
+- *size = b.len();
+-
+- Ok(Box::into_raw(b) as *mut *mut SourmashSignature)
+-}
+-}
+--- sourmash.orig/src/sourmash/index/revindex.py
++++ sourmash/src/sourmash/index/revindex.py
+@@ -1,240 +1,3 @@
+-"""
+-RevIndex - a rust-based reverse index by hashes.
+-"""
+-
+-import weakref
+-
+-from sourmash.index import Index, IndexSearchResult
+-from sourmash.minhash import MinHash
+-from sourmash.signature import SourmashSignature
+-from sourmash._lowlevel import ffi, lib
+-from sourmash.utils import RustObject, rustcall, decode_str, encode_str
+-
+-
+-class RevIndex(RustObject, Index):
+- __dealloc_func__ = lib.revindex_free
+-
+- def __init__(
+- self,
+- *,
+- signatures=None,
+- signature_paths=None,
+- template=None,
+- threshold=0,
+- queries=None,
+- keep_sigs=False,
+- ):
+- self.template = template
+- self.threshold = threshold
+- self.queries = queries
+- self.keep_sigs = keep_sigs
+- self.signature_paths = signature_paths
+- self._signatures = signatures
+-
+- if signature_paths is None or signatures is None:
+- # delay initialization
+- self._objptr = ffi.NULL
+- else:
+- self._init_inner()
+-
+- def _init_inner(self):
+- if self._objptr != ffi.NULL:
+- # Already initialized
+- return
+-
+- if (
+- self.signature_paths is None
+- and not self._signatures
+- and self._objptr == ffi.NULL
+- ):
+- raise ValueError("No signatures provided")
+- elif (self.signature_paths or self._signatures) and self._objptr != ffi.NULL:
+- raise NotImplementedError("Need to update RevIndex")
+-
+- attached_refs = weakref.WeakKeyDictionary()
+-
+- queries_ptr = ffi.NULL
+- queries_size = 0
+- if self.queries:
+- # get list of rust objects
+- collected = []
+- for obj in queries:
+- rv = obj._get_objptr()
+- attached_refs[rv] = obj
+- collected.append(rv)
+- queries_ptr = ffi.new("SourmashSignature*[]", collected)
+- queries_size = len(queries)
+-
+- template_ptr = ffi.NULL
+- if self.template:
+- if isinstance(self.template, MinHash):
+- template_ptr = self.template._get_objptr()
+- else:
+- raise ValueError("Template must be a MinHash")
+-
+- search_sigs_ptr = ffi.NULL
+- sigs_size = 0
+- collected = []
+- if self.signature_paths:
+- for path in self.signature_paths:
+- collected.append(encode_str(path))
+- search_sigs_ptr = ffi.new("SourmashStr*[]", collected)
+- sigs_size = len(signature_paths)
+-
+- self._objptr = rustcall(
+- lib.revindex_new_with_paths,
+- search_sigs_ptr,
+- sigs_size,
+- template_ptr,
+- self.threshold,
+- queries_ptr,
+- queries_size,
+- self.keep_sigs,
+- )
+- elif self._signatures:
+- # force keep_sigs=True, and pass SourmashSignature directly to RevIndex.
+- for sig in self._signatures:
+- collected.append(sig._get_objptr())
+- search_sigs_ptr = ffi.new("SourmashSignature*[]", collected)
+- sigs_size = len(self._signatures)
+-
+- self._objptr = rustcall(
+- lib.revindex_new_with_sigs,
+- search_sigs_ptr,
+- sigs_size,
+- template_ptr,
+- self.threshold,
+- queries_ptr,
+- queries_size,
+- )
+-
+- def signatures(self):
+- self._init_inner()
+-
+- size = ffi.new("uintptr_t *")
+- sigs_ptr = self._methodcall(lib.revindex_signatures, size)
+- size = size[0]
+-
+- sigs = []
+- for i in range(size):
+- sig = SourmashSignature._from_objptr(sigs_ptr[i])
+- sigs.append(sig)
+-
+- for sig in sigs:
+- yield sig
+-
+- # if self._signatures:
+- # yield from self._signatures
+- # else:
+- # raise NotImplementedError("Call into Rust and retrieve sigs")
+-
+- def __len__(self):
+- if self._objptr:
+- return self._methodcall(lib.revindex_len)
+- else:
+- return len(self._signatures)
+-
+- def insert(self, node):
+- if self._signatures is None:
+- self._signatures = []
+- self._signatures.append(node)
+-
+- def save(self, path):
+- pass
+-
+- @classmethod
+- def load(cls, location):
+- pass
+-
+- def select(self, ksize=None, moltype=None, **kwargs):
+- if self.template:
+- if ksize:
+- self.template.ksize = ksize
+- if moltype:
+- self.template.moltype = moltype
+- else:
+- # TODO: deal with None/default values
+- self.template = MinHash(ksize=ksize, moltype=moltype)
+-
+- # def search(self, query, *args, **kwargs):
+- # """Return set of matches with similarity above 'threshold'.
+- #
+- # Results will be sorted by similarity, highest to lowest.
+- #
+- # Optional arguments:
+- # * do_containment: default False. If True, use Jaccard containment.
+- # * ignore_abundance: default False. If True, and query signature
+- # and database support k-mer abundances, ignore those abundances.
+- #
+- # Note, the "best only" hint is ignored by LCA_Database
+- # """
+- # if not query.minhash:
+- # return []
+- #
+- # # check arguments
+- # if "threshold" not in kwargs:
+- # raise TypeError("'search' requires 'threshold'")
+- # threshold = kwargs["threshold"]
+- # do_containment = kwargs.get("do_containment", False)
+- # ignore_abundance = kwargs.get("ignore_abundance", False)
+- #
+- # self._init_inner()
+- #
+- # size = ffi.new("uintptr_t *")
+- # results_ptr = self._methodcall(
+- # lib.revindex_search,
+- # query._get_objptr(),
+- # threshold,
+- # do_containment,
+- # ignore_abundance,
+- # size,
+- # )
+- #
+- # size = size[0]
+- # if size == 0:
+- # return []
+- #
+- # results = []
+- # for i in range(size):
+- # match = SearchResult._from_objptr(results_ptr[i])
+- # if match.score >= threshold:
+- # results.append(IndexSearchResult(match.score, match.signature, match.filename))
+- #
+- # return results
+- #
+- # def gather(self, query, *args, **kwargs):
+- # "Return the match with the best Jaccard containment in the database."
+- # if not query.minhash:
+- # return []
+- #
+- # self._init_inner()
+- #
+- # threshold_bp = kwargs.get("threshold_bp", 0.0)
+- # threshold = threshold_bp / (len(query.minhash) * self.scaled)
+- #
+- # results = []
+- # size = ffi.new("uintptr_t *")
+- # results_ptr = self._methodcall(
+- # lib.revindex_gather, query._get_objptr(), threshold, True, True, size
+- # )
+- # size = size[0]
+- # if size == 0:
+- # return []
+- #
+- # results = []
+- # for i in range(size):
+- # match = SearchResult._from_objptr(results_ptr[i])
+- # if match.score >= threshold:
+- # results.append(IndexSearchResult(match.score, match.signature, match.filename))
+- #
+- # results.sort(reverse=True,
+- # key=lambda x: (x.score, x.signature.md5sum()))
+- #
+- # return results[:1]
+-
+- @property
+- def scaled(self):
+- return self._methodcall(lib.revindex_scaled)
+
+
+ class SearchResult(RustObject):
+--- sourmash.orig/tests/test_index.py
++++ sourmash/tests/test_index.py
+@@ -20,7 +20,6 @@
+ StandaloneManifestIndex,
+ )
+ from sourmash.signature import load_one_signature_from_json, save_signatures_to_json
+-from sourmash.index.revindex import RevIndex
+ from sourmash.sbt import SBT, GraphFactory
+ from sourmash import sourmash_args
+ from sourmash.search import JaccardSearch, SearchType
+@@ -1812,108 +1811,6 @@
+ assert ss_tup == ss_lazy_tup
+
+
+-def test_revindex_index_search():
+- # confirm that RevIndex works
+- sig2 = utils.get_test_data("2.fa.sig")
+- sig47 = utils.get_test_data("47.fa.sig")
+- sig63 = utils.get_test_data("63.fa.sig")
+-
+- ss2 = load_one_signature_from_json(sig2, ksize=31)
+- ss47 = load_one_signature_from_json(sig47)
+- ss63 = load_one_signature_from_json(sig63)
+-
+- lidx = RevIndex(template=ss2.minhash)
+- lidx.insert(ss2)
+- lidx.insert(ss47)
+- lidx.insert(ss63)
+-
+- # now, search for sig2
+- sr = lidx.search(ss2, threshold=1.0)
+- print([s[1].name for s in sr])
+- assert len(sr) == 1
+- assert sr[0][1] == ss2
+-
+- # search for sig47 with lower threshold; search order not guaranteed.
+- sr = lidx.search(ss47, threshold=0.1)
+- print([s[1].name for s in sr])
+- assert len(sr) == 2
+- sr.sort(key=lambda x: -x[0])
+- assert sr[0][1] == ss47
+- assert sr[1][1] == ss63
+-
+- # search for sig63 with lower threshold; search order not guaranteed.
+- sr = lidx.search(ss63, threshold=0.1)
+- print([s[1].name for s in sr])
+- assert len(sr) == 2
+- sr.sort(key=lambda x: -x[0])
+- assert sr[0][1] == ss63
+- assert sr[1][1] == ss47
+-
+- # search for sig63 with high threshold => 1 match
+- sr = lidx.search(ss63, threshold=0.8)
+- print([s[1].name for s in sr])
+- assert len(sr) == 1
+- sr.sort(key=lambda x: -x[0])
+- assert sr[0][1] == ss63
+-
+-
+-def test_revindex_gather():
+- # check that RevIndex.best_containment works.
+- sig2 = utils.get_test_data("2.fa.sig")
+- sig47 = utils.get_test_data("47.fa.sig")
+- sig63 = utils.get_test_data("63.fa.sig")
+-
+- ss2 = load_one_signature_from_json(sig2, ksize=31)
+- ss47 = load_one_signature_from_json(sig47)
+- ss63 = load_one_signature_from_json(sig63)
+-
+- lidx = RevIndex(template=ss2.minhash)
+- lidx.insert(ss2)
+- lidx.insert(ss47)
+- lidx.insert(ss63)
+-
+- match = lidx.best_containment(ss2)
+- assert match
+- assert match.score == 1.0
+- assert match.signature == ss2
+-
+- match = lidx.best_containment(ss47)
+- assert match
+- assert match.score == 1.0
+- assert match.signature == ss47
+-
+-
+-def test_revindex_gather_ignore():
+- # check that RevIndex gather ignores things properly.
+- sig2 = utils.get_test_data("2.fa.sig")
+- sig47 = utils.get_test_data("47.fa.sig")
+- sig63 = utils.get_test_data("63.fa.sig")
+-
+- ss2 = load_one_signature_from_json(sig2, ksize=31)
+- ss47 = load_one_signature_from_json(sig47, ksize=31)
+- ss63 = load_one_signature_from_json(sig63, ksize=31)
+-
+- # construct an index...
+- lidx = RevIndex(template=ss2.minhash, signatures=[ss2, ss47, ss63])
+-
+- # ...now search with something that should ignore sig47, the exact match.
+- search_fn = JaccardSearchBestOnly_ButIgnore([ss47])
+-
+- results = list(lidx.find(search_fn, ss47))
+- results = [ss.signature for ss in results]
+-
+- def is_found(ss, xx):
+- for q in xx:
+- print(ss, ss.similarity(q))
+- if ss.similarity(q) == 1.0:
+- return True
+- return False
+-
+- assert not is_found(ss47, results)
+- assert not is_found(ss2, results)
+- assert is_found(ss63, results)
+-
+-
+ def test_standalone_manifest_signatures(runtmp):
+ # build a StandaloneManifestIndex and test 'signatures' method.
+
+--- sourmash.orig/tests/test_index_protocol.py
++++ sourmash/tests/test_index_protocol.py
+@@ -18,7 +18,6 @@
+ )
+ from sourmash.index import CounterGather
+ from sourmash.index.sqlite_index import SqliteIndex
+-from sourmash.index.revindex import RevIndex
+ from sourmash.sbt import SBT, GraphFactory
+ from sourmash.manifest import CollectionManifest, BaseCollectionManifest
+ from sourmash.lca.lca_db import LCA_Database, load_single_database
+@@ -147,17 +146,6 @@
+ return db
+
+
+-def build_revindex(runtmp):
+- ss2, ss47, ss63 = _load_three_sigs()
+-
+- lidx = RevIndex(template=ss2.minhash)
+- lidx.insert(ss2)
+- lidx.insert(ss47)
+- lidx.insert(ss63)
+-
+- return lidx
+-
+-
+ def build_lca_index_save_load_sql(runtmp):
+ db = build_lca_index(runtmp)
+ outfile = runtmp.output("db.lca.json")
View it on GitLab: https://salsa.debian.org/med-team/sourmash/-/commit/1e81fd744ffd4e764bbb25648417b9da0701e060
--
View it on GitLab: https://salsa.debian.org/med-team/sourmash/-/commit/1e81fd744ffd4e764bbb25648417b9da0701e060
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20241028/5bf5ceed/attachment-0001.htm>
More information about the debian-med-commit
mailing list