[med-svn] [Git][med-team/bustools][upstream] New upstream version 0.42.0+dfsg
Nilesh Patra (@nilesh)
gitlab at salsa.debian.org
Sat Dec 31 11:21:14 GMT 2022
Nilesh Patra pushed to branch upstream at Debian Med / bustools
Commits:
3f436bca by Nilesh Patra at 2022-12-31T11:11:41+00:00
New upstream version 0.42.0+dfsg
- - - - -
15 changed files:
- − .github/workflows/release.yml
- src/BUSData.cpp
- src/BUSData.h
- src/Common.hpp
- + src/bustools_compress.cpp
- + src/bustools_compress.h
- src/bustools_correct.cpp
- + src/bustools_decompress.cpp
- + src/bustools_decompress.h
- src/bustools_inspect.cpp
- src/bustools_main.cpp
- src/bustools_mash.cpp
- src/bustools_merge.cpp
- src/bustools_sort.cpp
- src/bustools_whitelist.cpp
Changes:
=====================================
.github/workflows/release.yml deleted
=====================================
@@ -1,82 +0,0 @@
-name: Build release
-
-on:
- workflow_dispatch:
- release:
- types: [created]
-
-jobs:
- build-linux:
- name: Build linux
- runs-on: ubuntu-18.04
- env:
- RELEASE_OS: linux
- steps:
- - name: Checkout branch
- uses: actions/checkout at master
- - name: Setup environment
- id: setup
- run: echo ::set-output name=RELEASE_VERSION::${GITHUB_REF##*/}
- - name: Build
- run: |
- docker run --rm dockbuild/centos5:latest > ./dockbuild \
- && chmod +x ./dockbuild \
- && ./dockbuild -a "-e RELEASE_OS=$RELEASE_OS -e RELEASE_VERSION=$RELEASE_VERSION" bash -c "make -f Makefile.release build && make -f Makefile.release compile_release_$RELEASE_OS"
- env:
- RELEASE_VERSION: ${{ steps.setup.outputs.RELEASE_VERSION }}
- - name: Upload to release
- uses: svenstaro/upload-release-action at v1-release
- with:
- repo_token: ${{ secrets.GITHUB_TOKEN }}
- file: release/bustools_${{ env.RELEASE_OS }}-${{ steps.setup.outputs.RELEASE_VERSION }}.tar.gz
- asset_name: bustools_${{ env.RELEASE_OS }}-${{ steps.setup.outputs.RELEASE_VERSION }}.tar.gz
- tag: ${{ github.ref }}
-
- build-mac:
- name: Build mac
- runs-on: macos-latest
- env:
- RELEASE_OS: mac
- steps:
- - name: Checkout branch
- uses: actions/checkout at master
- - name: Setup environment
- id: setup
- run: echo ::set-output name=RELEASE_VERSION::${GITHUB_REF##*/}
- - name: Build
- run: make -f Makefile.release build && make -f Makefile.release compile_release_$RELEASE_OS
- env:
- RELEASE_VERSION: ${{ steps.setup.outputs.RELEASE_VERSION }}
- - name: Upload to release
- uses: svenstaro/upload-release-action at v1-release
- with:
- repo_token: ${{ secrets.GITHUB_TOKEN }}
- file: release/bustools_${{ env.RELEASE_OS }}-${{ steps.setup.outputs.RELEASE_VERSION }}.tar.gz
- asset_name: bustools_${{ env.RELEASE_OS }}-${{ steps.setup.outputs.RELEASE_VERSION }}.tar.gz
- tag: ${{ github.ref }}
-
- build-windows:
- name: Build windows
- runs-on: ubuntu-18.04
- env:
- RELEASE_OS: windows
- steps:
- - name: Checkout branch
- uses: actions/checkout at master
- - name: Setup environment
- id: setup
- run: echo ::set-output name=RELEASE_VERSION::${GITHUB_REF##*/}
- - name: Build
- run: |
- docker run --rm dockcross/windows-static-x64:20191127-92bdbca > ./dockcross \
- && chmod +x ./dockcross \
- && ./dockcross -a "-e RELEASE_OS=$RELEASE_OS -e RELEASE_VERSION=$RELEASE_VERSION" bash -c "make -f Makefile.release install_zlib && make -f Makefile.release build && make -f Makefile.release compile_release_$RELEASE_OS"
- env:
- RELEASE_VERSION: ${{ steps.setup.outputs.RELEASE_VERSION }}
- - name: Upload to release
- uses: svenstaro/upload-release-action at v1-release
- with:
- repo_token: ${{ secrets.GITHUB_TOKEN }}
- file: release/bustools_${{ env.RELEASE_OS }}-${{ steps.setup.outputs.RELEASE_VERSION }}.zip
- asset_name: bustools_${{ env.RELEASE_OS }}-${{ steps.setup.outputs.RELEASE_VERSION }}.zip
- tag: ${{ github.ref }}
=====================================
src/BUSData.cpp
=====================================
@@ -71,6 +71,36 @@ uint64_t stringToBinary(const char* s, const size_t len, uint32_t &flag) {
return r;
}
+int identifyParseHeader(std::istream &inf, BUSHeader &header, compressed_BUSHeader &comp_header)
+{
+ int ret = -1;
+ char magic[5];
+
+ magic[4] = '\0';
+
+ inf.read(magic, 4);
+ for (int i = 0; i < 4; ++i)
+ {
+ inf.putback(magic[3 - i]);
+ }
+
+ if (std::strcmp(magic, "BUS\0") == 0)
+ {
+ return BUSFILE_TYPE::BUSFILE * parseHeader(inf, header);
+ }
+ else if (std::strcmp(magic, "BUS\1") == 0)
+ {
+ return BUSFILE_TYPE::BUSFILE_COMPRESED * parseCompressedHeader(inf, comp_header);
+ }
+ else if(std::strcmp(magic, "BZI\0") == 0){
+ return BUSFILE_TYPE::BUSZ_INDEX;
+ }
+ else if (std::strcmp(magic, "BEC\0") == 0)
+ {
+ return BUSFILE_TYPE::EC_MATRIX_COMPRESSED;
+ }
+ return BUSFILE_TYPE::EC_MATRIX;
+}
bool parseHeader(std::istream &inf, BUSHeader &header) {
char magic[4];
@@ -95,7 +125,80 @@ bool parseHeader(std::istream &inf, BUSHeader &header) {
return true;
}
+bool parseCompressedHeader(std::istream &inf, compressed_BUSHeader &compheader)
+{
+ char magic[5];
+ magic[4] = '\0';
+
+ BUSHeader &header = compheader.bus_header;
+ inf.read(magic, 4);
+ if (std::strcmp(magic, "BUS\1") != 0)
+ {
+ std::cerr << "Invalid header magic\n";
+ return false;
+ }
+ inf.read((char *)(&header.version), sizeof(header.version));
+ if (header.version != BUSFORMAT_VERSION)
+ {
+ return false;
+ }
+ inf.read((char *)(&header.bclen), sizeof(header.bclen));
+ inf.read((char *)(&header.umilen), sizeof(header.umilen));
+ uint32_t tlen = 0;
+ inf.read((char *)(&tlen), sizeof(tlen));
+ char *t = new char[tlen + 1];
+ inf.read(t, tlen);
+ t[tlen] = '\0';
+ header.text.assign(t);
+ delete[] t;
+
+ // We store the compressed_header-specific information after the regular header
+ inf.read((char *)&compheader.chunk_size, sizeof(compheader.chunk_size));
+ inf.read((char *)&compheader.pfd_blocksize, sizeof(compheader.pfd_blocksize));
+ inf.read((char *)&compheader.lossy_umi, sizeof(compheader.lossy_umi));
+ return true;
+}
+
+bool parseECs_stream(std::istream &in, BUSHeader &header)
+{
+ auto &ecs = header.ecs;
+ std::string line, t;
+ line.reserve(10000);
+
+ std::vector<int32_t> c;
+
+ int i = 0;
+ bool has_reached = false;
+ while (std::getline(in, line))
+ {
+ c.clear();
+ int ec = -1;
+ if (line.size() == 0)
+ {
+ continue;
+ }
+ std::stringstream ss(line);
+ ss >> ec;
+ assert(ec == i);
+ while (std::getline(ss, t, ','))
+ {
+ c.push_back(std::stoi(t));
+ }
+ if (!has_reached)
+ {
+ has_reached |= !(c.size() == 1 && c[0] == i);
+ if (has_reached)
+ {
+ std::cerr << "first line is " << i << '\n';
+ }
+ }
+
+ ecs.push_back(std::move(c));
+ ++i;
+ }
+ return true;
+}
bool parseECs(const std::string &filename, BUSHeader &header) {
auto &ecs = header.ecs;
@@ -294,3 +397,24 @@ bool writeHeader(std::ostream &outf, const BUSHeader &header) {
return true;
}
+
+bool writeCompressedHeader(std::ostream &outf, const compressed_BUSHeader &compheader)
+{
+ outf.write("BUS\1", 4);
+
+ // We start writing out the contents of the general header
+ const auto header = compheader.bus_header;
+ outf.write((char *)(&header.version), sizeof(header.version));
+ outf.write((char *)(&header.bclen), sizeof(header.bclen));
+ outf.write((char *)(&header.umilen), sizeof(header.umilen));
+ uint32_t tlen = header.text.size();
+ outf.write((char *)(&tlen), sizeof(tlen));
+ outf.write((char *)header.text.c_str(), tlen);
+
+ // We end by writing out the compressed-header-specific data
+ outf.write((char *)(&compheader.chunk_size), sizeof(compheader.chunk_size));
+ outf.write((char *)&compheader.pfd_blocksize, sizeof(compheader.pfd_blocksize));
+ outf.write((char *)(&compheader.lossy_umi), sizeof(compheader.lossy_umi));
+
+ return true;
+}
\ No newline at end of file
=====================================
src/BUSData.h
=====================================
@@ -28,6 +28,15 @@ struct BUSHeader {
BUSHeader() : version(0), bclen(0), umilen(0) {}
};
+struct compressed_BUSHeader
+{
+ uint32_t chunk_size;
+ uint32_t lossy_umi;
+ uint32_t pfd_blocksize = 512;
+ BUSHeader bus_header;
+ compressed_BUSHeader() : chunk_size(0), lossy_umi(0) {}
+};
+
struct BUSData {
uint64_t barcode;
uint64_t UMI;
@@ -38,11 +47,23 @@ struct BUSData {
BUSData() : barcode(0), UMI(0), ec(-1), count(0), flags(0), pad(0) {}
};
+enum BUSFILE_TYPE
+{
+ BUSFILE = 1,
+ BUSFILE_COMPRESED = 2,
+ BUSZ_INDEX = 3,
+ EC_MATRIX = 4,
+ EC_MATRIX_COMPRESSED = 5
+};
bool parseHeader(std::istream &inf, BUSHeader &header);
bool writeHeader(std::ostream &outf, const BUSHeader &header);
+bool parseCompressedHeader(std::istream &inf, compressed_BUSHeader &header);
+bool writeCompressedHeader(std::ostream &inf, const compressed_BUSHeader &header);
+int identifyParseHeader(std::istream &inf, BUSHeader &header, compressed_BUSHeader &comp_header);
+bool parseECs_stream(std::istream &in, BUSHeader &header);
bool parseECs(const std::string &filename, BUSHeader &header);
bool writeECs(const std::string &filename, const BUSHeader &header);
bool writeGenes(const std::string &filename, const std::unordered_map<std::string, int32_t> &genenames);
=====================================
src/Common.hpp
=====================================
@@ -10,7 +10,7 @@
#include <unordered_map>
#include <sstream>
-#define BUSTOOLS_VERSION "0.41.0"
+#define BUSTOOLS_VERSION "0.42.0"
enum CAPTURE_TYPE : char
{
@@ -102,6 +102,12 @@ struct Bustools_opt
/* linker */
int start, end;
+ /* Compression */
+ std::string busz_index;
+ uint32_t chunk_size = 100000;
+ uint32_t lossy_umi = 0;
+ uint32_t pfd_blocksize = 512;
+
Bustools_opt() : threads(1), max_memory(1ULL << 32), type(0),
threshold(0), start(-1), end(-1) {}
};
=====================================
src/bustools_compress.cpp
=====================================
@@ -0,0 +1,969 @@
+#include <iostream>
+#include <fstream>
+#include <cstring>
+#include <algorithm>
+#include <vector>
+#include <array>
+#include <stdint.h>
+#include <zlib.h>
+
+#include "Common.hpp"
+#include "BUSData.h"
+#include "bustools_compress.h"
+
+size_t pfd_blocksize = 512;
+
+/**
+ * @brief Encode `num` using fibonacci encoding into buf, starting at bitpos.
+ * @pre the next 128 bits in `buf` are 0 starting from `bitpos`, and wrapped around 192.
+ * 0 <= bit_pos < 3*64 == 192
+ *
+ * @pre num > 0
+ * @post buf now contains the fibo encoding of num from [pre(bitpos); post(bitpos)].
+ * bit_pos has changed, 0 <= bitpos < 3*64 == 192.
+ * 0 or more elements in buf have been concentrated with fibo-encoded numbers and thus written to obuf.
+ *
+ * @note The largest fibonacci number that fits in a uint64_t is the 91st (0-indexed) fibo number, i.e. the first 92 fibonacci numbers fit in a 64-bit uint.
+ * Fibonacci encoding appends a single bit as a stop code. Hence, the longest fibonacci encoding uses 93 bits.
+ *
+ * @param num the number to encode, num > 0
+ * @param bufsize the size of the output buffer.
+ * @param buf array of `bufsize` uint64_t. num is fibonacci-encoded in buf.
+ * @param bitpos the bit position in buf where the next fibo encoding of num starts.
+ * @param obuf the ostream buffer to write to.
+ * @return bool true iff encoding the current number does not write outside of buf
+ */
+
+const auto fibo_begin = fibo64.begin();
+const auto fibo_end = fibo64.end();
+
+template <typename BUF_t>
+bool fiboEncode(const uint64_t num, const size_t bufsize, BUF_t *buf, size_t &bitpos)
+{
+ constexpr uint32_t word_size = sizeof(BUF_t) * 8;
+
+ const size_t max_bitpos = bufsize * word_size;
+ constexpr BUF_t ONE{1};
+
+ const uint32_t curr_byte_pos = bitpos / word_size;
+
+ uint64_t remainder = num;
+
+ // the ith fibonacci number is the largest fibo not greater than remainder
+ auto i = std::upper_bound(fibo_begin, fibo_end, remainder) - 1;
+
+ const uint32_t n_bits = (i - fibo_begin) + 2;
+
+ // Encoding the current number would write out of bounds of buf.
+ if (bitpos + n_bits > max_bitpos)
+ return false;
+
+ uint32_t next_bit_pos = bitpos + n_bits - 1,
+ bit_offset = next_bit_pos % word_size,
+ buf_offset = (next_bit_pos / word_size) % bufsize;
+
+ // Set the stop bit.
+ buf[buf_offset] |= ONE << (word_size - 1 - bit_offset);
+
+ ++i;
+ while (remainder > 0)
+ {
+ i = std::upper_bound(fibo_begin, i, remainder) - 1;
+ next_bit_pos = bitpos + (i - fibo_begin);
+ buf_offset = (next_bit_pos / word_size) % bufsize;
+ bit_offset = next_bit_pos % word_size;
+
+ buf[buf_offset] |= ONE << (word_size - 1 - bit_offset);
+ remainder -= *i;
+ }
+ bitpos += n_bits;
+ return true;
+}
+
+/**
+ * @brief pack elem into buf starting at bitpos, using exactly b_bits bits.
+ * @pre num is representable using `b_bits` bits.
+ * @pre buf has at least one element
+ * @pre buf has at least two elements if bitpos + b_bits > 8*sizeof(buf)..
+ *
+ * @param b_bits The number of bits to represent elem.
+ * @param elem The number to pack, must be representable with at most `b_bits` bits.
+ * @param buf The int64_t array where elem should be packed into.
+ * @param bitpos The starting point in bits of where to back elem.
+ * @return bool: true iff packing of elem saturates buf[0].
+ */
+template <typename T>
+bool pack_int(
+ const uint32_t b_bits,
+ T elem,
+ T *buf,
+ uint32_t &bitpos)
+{
+ constexpr int32_t dest_wordsize = sizeof(T) * 8;
+
+ // | | |
+ // ^ ^
+ // bitpos dest_wordsize
+ int32_t shift = dest_wordsize - bitpos - b_bits;
+ T carryover = 0;
+ constexpr T ONE{1};
+
+ if (shift < 0)
+ {
+ // b_bits > (dest_wordsize - bitpos)
+ // -> number covers two elements
+
+ carryover = elem & (ONE << -shift) - 1;
+ *(buf + 1) = carryover << dest_wordsize + shift;
+ elem >>= -shift;
+ }
+
+ // shift by max(0, shift)
+ *buf |= elem << (shift > 0) * shift;
+
+ bitpos = (dest_wordsize - shift) % dest_wordsize;
+ return (shift <= 0);
+}
+
+/**
+ * @brief Encode a block of integers using NewPFD.
+ * @pre BUF has a size of `pfd_block`.size() / 64 * `b_bits` elements.
+ *
+ * @param pfd_block The numbers to encode.
+ * @param index_gaps Output: delta encoded indices of exceptions in `pfd_block`.
+ * @param exceptions Output: The most significant bits of each exception..
+ * @param b_bits The number of bits to use per int for the block.
+ * @param min_element The smallest element in `pfd_block`.
+ * @param BUF The buffer to pack the elements of pfd_block into.
+ */
+template <typename SRC_T, typename DEST_T>
+void encode_pfd_block(
+ std::vector<SRC_T> &pfd_block,
+ std::vector<uint32_t> &index_gaps,
+ std::vector<SRC_T> &exceptions,
+ const uint32_t b_bits,
+ const SRC_T min_element,
+ DEST_T *BUF)
+{
+ index_gaps.clear();
+ exceptions.clear();
+
+ SRC_T max_elem_bit_mask = (1ULL << b_bits) - 1;
+ uint32_t bitpos = 0,
+ idx = 0,
+ last_ex_idx = 0;
+
+ bool do_increment_pointer = 0;
+
+ // Store the elements in the primary block in pfd_buf using `b_bits` bits each.
+ for (SRC_T &elem : pfd_block)
+ {
+ elem -= min_element;
+ if (elem > max_elem_bit_mask)
+ {
+ // store the overflowing, most significant bits as exceptions.
+ exceptions.push_back(elem >> b_bits);
+ index_gaps.push_back(idx - last_ex_idx);
+ last_ex_idx = idx;
+
+ // store the least b significant bits in the frame.
+ elem &= max_elem_bit_mask;
+ }
+
+ do_increment_pointer = pack_int<DEST_T>(b_bits, elem, BUF, bitpos);
+ BUF += do_increment_pointer;
+ ++idx;
+ }
+}
+
+/**
+ * @brief Encode a block of `block_size` elements in `pfd_block` and write to of.
+ *
+ * @param block_size The number of elements in the block.
+ * @param pfd_block The elements to encode.
+ * @param index_gaps Vector to store delta-encoded indices of exceptions..
+ * @param exceptions Vector to store the most significant bits of exceptions.
+ * @param fibonacci_buf array used for storing fibonacci encoding.
+ * @param b_bits The number of bits each num in `pfd_block` is represented with.
+ * @param min_element The smallest element in `pfd_block`.
+ * @param of The ostream to write out the encoding.
+ */
+
+template <typename SRC_T, typename DEST_T>
+size_t new_pfd(
+ const size_t block_size,
+ std::vector<SRC_T> &pfd_block,
+ std::vector<uint32_t> &index_gaps,
+ std::vector<SRC_T> &exceptions,
+ DEST_T *fibonacci_buf,
+ const size_t b_bits,
+ const SRC_T min_element,
+ size_t fibonacci_bufsize,
+ DEST_T *PFD_buf)
+{
+ bool success = true;
+ constexpr size_t wordsize = sizeof(DEST_T) * 8;
+
+ size_t buf_size = (block_size * b_bits) / wordsize;
+ std::fill(PFD_buf, PFD_buf + buf_size, 0ULL);
+
+ size_t bitpos{0};
+
+ encode_pfd_block<SRC_T, DEST_T>(pfd_block, index_gaps, exceptions, b_bits, min_element, PFD_buf);
+
+ size_t n_exceptions = index_gaps.size();
+ // For more compact fibonacci encoding, we pack the fibo encoded values together
+ success &= fiboEncode(b_bits + 1, fibonacci_bufsize, fibonacci_buf, bitpos);
+ success &= fiboEncode(min_element + 1, fibonacci_bufsize, fibonacci_buf, bitpos);
+ success &= fiboEncode(n_exceptions + 1, fibonacci_bufsize, fibonacci_buf, bitpos);
+
+ for (const auto &i : index_gaps)
+ {
+ // we must increment by one, since the first index gap can be zero
+ success &= fiboEncode(i + 1, fibonacci_bufsize, fibonacci_buf, bitpos);
+ }
+ for (const auto &ex : exceptions)
+ {
+ // These are always > 0 since they contain the most significant bits of exception
+ success &= fiboEncode(ex, fibonacci_bufsize, fibonacci_buf, bitpos);
+ }
+
+ size_t n_elems = bitpos / wordsize + (bitpos % wordsize > 0);
+ success &= (n_elems + buf_size <= fibonacci_bufsize);
+
+ std::memcpy(fibonacci_buf + n_elems, PFD_buf, buf_size * sizeof(DEST_T) * success);
+ n_elems += buf_size;
+
+ return n_elems * success;
+}
+
+/**
+ * @brief Compute the smallest element, and the number of bits required to encode at least 90% of
+ * the numbers in pfd_scratch.
+ * @pre pfd_scratch contains the numbers to encode in a block
+ * @post pfd_scratch may have been reordered.
+ * @param block_size The number of elements in pfd_scratch, i.e. the block size.
+ * @param pfd_scratch The numbers to encode in a single block using NewPFD.
+ * @param min_element Output: The smallest element in `pfd_scratch`.
+ * @param b_bits Output: The minimum number of bits that are enough to encode at least ~90% of the elements in `pfd_scratch`.
+ */
+
+template<typename T>
+void compute_pfd_params(
+ const size_t block_size,
+ std::vector<T> &pfd_scratch,
+ const T &min_element,
+ uint32_t &b_bits)
+{
+ const size_t nth_elem_idx = (size_t)((double)block_size * 0.9);
+
+ std::nth_element(pfd_scratch.begin(), pfd_scratch.begin() + nth_elem_idx, pfd_scratch.end());
+ T nth_max_element = pfd_scratch[nth_elem_idx];
+
+ b_bits = sizeof(T) * 8 - 1 - __builtin_clrsb(nth_max_element - min_element);
+ b_bits = b_bits ?: 1;
+}
+
+/**
+ * @brief Compress barcodes of rows using delta-runlen(0)-fibonacci encoding and write to `of`.
+ *
+ * @param rows BUSData array, contains at least `row_count` elements
+ * @param row_count The number of barcodes to compress.
+ * @param of The ostream for writing the encoding to.
+ * @return bool true iff encoding does not go out of bounds of obuf
+ */
+template <typename T>
+bool compress_barcodes(BUSData const *const rows, const int row_count, char *obuf, const size_t &obuf_size, size_t &global_bufpos)
+{
+ bool success = true;
+ uint64_t barcode = 0,
+ last_bc = 0,
+ runlen = 0;
+ size_t wordsize = sizeof(T) * 8;
+
+ const size_t fibonacci_bufsize = (obuf_size - global_bufpos) / sizeof(T);
+ T *fibonacci_buf = (T *)(obuf + global_bufpos);
+ size_t bitpos{0};
+
+ for (int i = 0; i < row_count && success; ++i)
+ {
+ barcode = rows[i].barcode;
+
+ // delta encoding
+ barcode -= last_bc;
+
+ // Runlength encoding of zeros
+ if (barcode == 0)
+ {
+ ++runlen;
+ }
+ else
+ {
+ // Increment values as fibo cannot encode 0
+ if (runlen)
+ {
+ success &= fiboEncode(1ULL, fibonacci_bufsize, fibonacci_buf, bitpos);
+ success &= fiboEncode(runlen, fibonacci_bufsize, fibonacci_buf, bitpos);
+ runlen = 0;
+ }
+
+ if (barcode + 1 == 0)
+ {
+ std::cerr << "This input file needs sorting. Please sort this file and try again." << std::endl;
+ throw std::runtime_error("Input needs sorting prior to compression");
+ }
+
+ success &= fiboEncode(barcode + 1, fibonacci_bufsize, fibonacci_buf, bitpos);
+ }
+ last_bc = rows[i].barcode;
+ }
+
+ // Take care of the last run of zeros in the delta-encoded barcodes
+ if (runlen)
+ {
+ success &= fiboEncode(1ULL, fibonacci_bufsize, fibonacci_buf, bitpos);
+ success &= fiboEncode(runlen, fibonacci_bufsize, fibonacci_buf, bitpos);
+ }
+
+ global_bufpos += (bitpos / wordsize + (bitpos % wordsize > 0)) * sizeof(T);
+ return success;
+}
+
+/**
+ * @brief Compress UMIs of rows using periodic_delta-runlen(0)-fibonacci encoding and write to `of`.
+ *
+ * @param rows BUSData array, contains at least `row_count` elements
+ * @param row_count The number of UMIs to compress.
+ * @param of The ostream for writing the encoding to.
+ * @return bool true iff encoding does not go out of bounds of obuf
+ */
+template <typename T>
+bool lossless_compress_umis(BUSData const *const rows, const int row_count, char *obuf, const size_t &obuf_size, size_t &global_bufpos)
+{
+ bool success = true;
+ uint64_t last_bc = rows[0].barcode + 1,
+ last_umi = 0,
+ bc, umi, diff;
+
+ size_t wordsize = sizeof(T) * 8;
+
+ const size_t fibonacci_bufsize = (obuf_size - global_bufpos) / sizeof(T);
+ T *fibonacci_buf = (T *)(obuf + global_bufpos);
+ size_t bitpos{0};
+
+ uint64_t runlen{0};
+ const uint32_t RLE_val{0ULL};
+
+ for (int i = 0; i < row_count && success; ++i)
+ {
+ bc = rows[i].barcode;
+
+ // We must increment umi, since a UMI==0 will confuse the runlength decoder.
+ umi = rows[i].UMI + 1;
+
+ if (last_bc != bc)
+ {
+ last_umi = 0;
+ }
+
+ diff = umi - last_umi;
+
+ if (diff == RLE_val)
+ {
+ ++runlen;
+ }
+ else
+ {
+ // Increment values for fibonacci encoding.
+ if (runlen)
+ {
+ success &= fiboEncode(RLE_val + 1, fibonacci_bufsize, fibonacci_buf, bitpos);
+ success &= fiboEncode(runlen, fibonacci_bufsize, fibonacci_buf, bitpos);
+ runlen = 0;
+ }
+
+ if(diff + 1 == 0){
+ std::cerr << "This input file needs sorting. Please sort this file and try again." << std::endl;
+ throw std::runtime_error("Input needs sorting prior to compression");
+ }
+
+ success &= fiboEncode(diff + 1, fibonacci_bufsize, fibonacci_buf, bitpos);
+ }
+
+ last_umi = umi;
+ last_bc = bc;
+ }
+
+ // Take care of the last run of zeros.
+ if (runlen)
+ {
+ success &= fiboEncode(RLE_val + 1, fibonacci_bufsize, fibonacci_buf, bitpos);
+ success &= fiboEncode(runlen, fibonacci_bufsize, fibonacci_buf, bitpos);
+ }
+
+ global_bufpos += (bitpos / wordsize + (bitpos % wordsize > 0)) * sizeof(T);
+
+ return success;
+}
+
+template <typename T>
+bool lossy_compress_umis(BUSData const *const rows, const int row_count, char *obuf, const size_t &obuf_size, size_t &global_bufpos)
+{
+ bool success = true;
+ std::cerr << "BEWARE: Lossy compression\n";
+ return success;
+}
+
+/**
+ * @brief Compress ECs of rows using NewPFD-fibonacci encoding and write to `of`.
+ *
+ * @param rows BUSData array, contains at least `row_count` elements
+ * @param row_count The number of ECs to compress.
+ * @param of The ostream for writing the encoding to.
+ * @return bool true iff encoding does not go out of bounds of obuf
+ */
+template <typename T>
+bool compress_ecs(BUSData const *const rows, const int row_count, char *obuf, const size_t &obuf_size, size_t &global_bufpos)
+{
+ bool success = true;
+ size_t BLOCK_SIZE{pfd_blocksize};
+ size_t wordsize = sizeof(T) * 8;
+ size_t buf_offset = 0;
+
+ std::vector<uint32_t> index_gaps;
+ std::vector<int32_t> pfd_scratch,
+ pfd_block,
+ exceptions;
+
+ // todo: We might be able to speed up by creating the primary array here.
+ size_t max_size_block = BLOCK_SIZE * sizeof(int32_t);
+ T *primary_block = new T[max_size_block];
+
+ exceptions.reserve(BLOCK_SIZE);
+ index_gaps.reserve(BLOCK_SIZE);
+ pfd_scratch.reserve(BLOCK_SIZE);
+ pfd_block.reserve(BLOCK_SIZE);
+
+ const size_t fibonacci_bufsize = (obuf_size - global_bufpos) / sizeof(T);
+ T *fibonacci_buf = (T *)(obuf + global_bufpos);
+
+ int row_index{0};
+ int pfd_row_index{0};
+ size_t elems_written = 0;
+ size_t byte_count = 0;
+ while (row_index < row_count && success)
+ {
+ pfd_row_index = 0;
+ pfd_scratch.clear();
+ pfd_block.clear();
+ int32_t min_element = rows[row_index].ec,
+ curr_el;
+
+ while (pfd_row_index < BLOCK_SIZE && row_index < row_count)
+ {
+ curr_el = rows[row_index].ec;
+ pfd_scratch.push_back(curr_el);
+ pfd_block.push_back(curr_el);
+
+ min_element = (min_element < curr_el) * min_element + (curr_el <= min_element) * curr_el;
+ ++pfd_row_index;
+ ++row_index;
+ }
+
+ uint32_t b_bits = 0;
+ compute_pfd_params(pfd_row_index, pfd_scratch, min_element, b_bits);
+
+ // we don't want to reset the fibonacci bytes, as this is our primary buffer.
+ elems_written = new_pfd<int32_t, T>(
+ BLOCK_SIZE,
+ pfd_block,
+ index_gaps,
+ exceptions,
+ fibonacci_buf + buf_offset,
+ b_bits,
+ min_element,
+ fibonacci_bufsize - buf_offset,
+ primary_block);
+
+ success &= (elems_written > 0);
+ buf_offset += elems_written;
+ byte_count += elems_written * sizeof(T);
+ }
+
+ global_bufpos += byte_count;
+
+ delete[] primary_block;
+ return success;
+}
+
+/**
+ * @brief Compress counts of rows using runlength(1)-fibonacci encoding and write to `of`.
+ *
+ * @param rows BUSData array, contains at least `row_count` elements
+ * @param row_count The number of counts to compress.
+ * @param of The ostream for writing the encoding to.
+ * @return bool true iff encoding does not go out of bounds of obuf
+ */
+template <typename T>
+bool compress_counts(BUSData const *const rows, const int row_count, char *obuf, const size_t &obuf_size, size_t &global_bufpos)
+{
+ bool success = true;
+ const uint32_t RLE_val{1UL};
+ uint32_t count;
+ uint64_t runlen{0};
+
+ size_t wordsize = sizeof(T) * 8;
+
+ const size_t fibonacci_bufsize = (obuf_size - global_bufpos) / sizeof(T);
+ T *fibonacci_buf = (T *)(obuf + global_bufpos);
+ size_t bitpos{0};
+
+ for (int i = 0; i < row_count && success; ++i)
+ {
+ count = rows[i].count;
+ if (count == RLE_val)
+ {
+ ++runlen;
+ }
+ else
+ {
+ if (runlen)
+ {
+ // Runlength-encode 1s.
+ success &= fiboEncode(RLE_val, fibonacci_bufsize, fibonacci_buf, bitpos);
+ success &= fiboEncode(runlen, fibonacci_bufsize, fibonacci_buf, bitpos);
+ runlen = 0;
+ }
+ success &= fiboEncode(count, fibonacci_bufsize, fibonacci_buf, bitpos);
+ }
+ }
+ if (runlen)
+ {
+ // Runlength-encode last run of 1s.
+ success &= fiboEncode(RLE_val, fibonacci_bufsize, fibonacci_buf, bitpos);
+ success &= fiboEncode(runlen, fibonacci_bufsize, fibonacci_buf, bitpos);
+ }
+
+ global_bufpos += (bitpos / wordsize + (bitpos % wordsize > 0)) * sizeof(T);
+ return success;
+}
+
+/**
+ * @brief Compress flags of rows using runlength(0)-fibonacci encoding and write to `of`.
+ *
+ * @param rows BUSData array, contains at least `row_count` elements
+ * @param row_count The number of counts to compress.
+ * @param of The ostream for writing the encoding to.
+ * @return bool true iff encoding does not go out of bounds of obuf
+ */
+template <typename T>
+bool compress_flags(BUSData const *const rows, const int row_count, char *obuf, const size_t &obuf_size, size_t &global_bufpos)
+{
+ bool success = true;
+ const uint32_t RLE_val{0UL};
+ uint32_t flag,
+ runlen{0};
+
+ size_t wordsize = sizeof(T) * 8;
+
+ const size_t fibonacci_bufsize = (obuf_size - global_bufpos) / sizeof(T);
+ T *fibonacci_buf = (T *)(obuf + global_bufpos);
+ size_t bitpos{0};
+ // don't need to fill with zeros as that is done prior.
+
+ for (int i = 0; i < row_count && success; ++i)
+ {
+ flag = rows[i].flags;
+
+ if (flag == RLE_val)
+ {
+ ++runlen;
+ }
+ else
+ {
+ // Increment values as fibo cannot encode 0
+ if (runlen)
+ {
+ // Runlength-encode 0s (incremented).
+ success &= fiboEncode(RLE_val + 1, fibonacci_bufsize, fibonacci_buf, bitpos);
+ success &= fiboEncode(runlen, fibonacci_bufsize, fibonacci_buf, bitpos);
+ runlen = 0;
+ }
+ success &= fiboEncode(flag + 1, fibonacci_bufsize, fibonacci_buf, bitpos);
+ }
+ }
+
+ if (runlen)
+ {
+ // Runlength-encode last run of 0s (incremented).
+ success &= fiboEncode(RLE_val + 1, fibonacci_bufsize, fibonacci_buf, bitpos);
+ success &= fiboEncode(runlen, fibonacci_bufsize, fibonacci_buf, bitpos);
+ }
+
+ global_bufpos += (bitpos / wordsize + (bitpos % wordsize > 0)) * sizeof(T);
+ return success;
+}
+
+typedef bool (*compress_ptr)(BUSData const *, const int, char *, const size_t &, size_t &);
+
+void compress_busfile(const Bustools_opt &opt, std::ostream &outf, std::istream &in, BUSHeader &h)
+{
+ constexpr size_t ROW_SIZE = sizeof(BUSData);
+
+ size_t N = opt.max_memory / ROW_SIZE;
+ const size_t chunk_size = (N < opt.chunk_size) ? N : opt.chunk_size;
+
+ compress_ptr compressors[5]{
+ &compress_barcodes<uint64_t>,
+ (opt.lossy_umi ? &lossy_compress_umis<uint64_t> : &lossless_compress_umis<uint64_t>),
+ &compress_ecs<uint32_t>,
+ &compress_counts<uint64_t>,
+ &compress_flags<uint64_t>,
+ };
+
+ compressed_BUSHeader comp_h;
+ comp_h.chunk_size = chunk_size;
+ comp_h.lossy_umi = opt.lossy_umi;
+
+ comp_h.bus_header.text = h.text;
+ comp_h.bus_header.version = h.version;
+ comp_h.bus_header.bclen = h.bclen;
+ comp_h.bus_header.umilen = h.umilen;
+
+ pfd_blocksize = opt.pfd_blocksize;
+ comp_h.pfd_blocksize = pfd_blocksize;
+
+ writeCompressedHeader(outf, comp_h);
+
+ std::vector<uint32_t> block_sizes;
+
+ // 6 * chunk_size is usually good enough, but we make it a multiple of 8;
+ size_t bufsize = (6 * chunk_size / 8) * 8;
+ size_t bufpos = 0;
+ size_t buf_checkpoint = 0;
+
+ uint64_t block_header = 0;
+ uint64_t row_count = 0;
+
+ BUSData *busdata;
+ char *buffer;
+ BUSZIndex busz_index(h, opt);
+
+ try
+ {
+ busdata = new BUSData[chunk_size];
+ buffer = new char[bufsize];
+
+ std::fill(buffer, buffer + bufsize, 0);
+ while (in.good())
+ {
+
+ in.read((char *)busdata, chunk_size * ROW_SIZE);
+ busz_index_add(busz_index, busdata[0].barcode, outf.tellp());
+ row_count = in.gcount() / ROW_SIZE;
+
+ for (int i_col = 0; i_col < 5; ++i_col)
+ {
+ bool success = compressors[i_col](busdata, row_count, buffer, bufsize, bufpos);
+ if (!success)
+ {
+ bufsize *= 2;
+
+ char *newbuf = new char[bufsize];
+ std::memcpy(newbuf, buffer, buf_checkpoint);
+ std::fill(newbuf + buf_checkpoint, newbuf + bufsize, 0);
+
+ delete[] buffer;
+ buffer = newbuf;
+
+ bufpos = buf_checkpoint;
+ --i_col;
+ }
+ buf_checkpoint = bufpos;
+ }
+
+ block_header = bufpos << 30;
+ block_header |= row_count;
+
+ outf.write((char *)&block_header, sizeof(block_header));
+ outf.write(buffer, bufpos);
+
+ std::fill(buffer, buffer + bufpos, 0);
+
+ block_sizes.push_back(bufpos);
+ buf_checkpoint = 0;
+ bufpos = 0;
+ }
+ block_header = 0;
+ outf.write((char *)&block_header, sizeof(block_header));
+
+ delete[] busdata;
+ delete[] buffer;
+ }
+ catch (const std::bad_alloc &ex)
+ {
+ std::cerr << "Unable to allocate buffer\n"
+ << ex.what() << std::endl;
+ delete[] busdata;
+ delete[] buffer;
+ exit(1);
+ }
+ catch(const std::runtime_error &ex){
+ delete[] busdata;
+ delete[] buffer;
+ exit(-1);
+ }
+
+ if (!opt.busz_index.empty())
+ {
+ std::ofstream ofindex;
+ ofindex.open(opt.busz_index);
+ busz_index.last_block = row_count ?: chunk_size;
+ write_BuszIndex(busz_index, ofindex);
+ }
+}
+
+void busz_index_add(BUSZIndex &index, uint64_t bc, uint64_t pos)
+{
+ index.barcodes.push_back(bc);
+ index.positions.push_back(pos);
+ ++index.n_blocks;
+}
+void write_BuszIndex(BUSZIndex &index, std::ostream &of)
+{
+
+ if(index.n_blocks > 1 && index.barcodes[index.n_blocks-2] == index.barcodes.back()){
+ index.barcodes.pop_back();
+ index.positions.pop_back();
+ --index.n_blocks;
+ }
+
+ of.write("BZI\0", 4);
+ of.write((char *)&index.n_blocks, sizeof(index.n_blocks));
+ of.write((char *)&index.block_size, sizeof(index.block_size));
+ of.write((char *)&index.last_block, sizeof(index.last_block));
+
+ for (int i = 0; i < index.n_blocks; ++i)
+ {
+ of.write((char *)&index.barcodes[i], sizeof(index.barcodes[i]));
+ of.write((char *)&index.positions[i], sizeof(index.positions[i]));
+ }
+}
+
+template <typename T>
+bool pack_ec_row_to_file(
+ const std::vector<int32_t> &ecs,
+ const size_t bufsize,
+ T *buf,
+ size_t &bitpos,
+ std::ostream &of)
+{
+ bool success = true;
+ // Diffs must be incremented since ec == 0 is valid, although rare.
+ constexpr int32_t RL_VAL{2};
+
+ size_t n_elems{ecs.size()};
+
+ uint32_t diff,
+ last_ec{0},
+ runlen{0};
+
+ success &= fiboEncode<T>(n_elems, bufsize, buf, bitpos);
+
+ const auto ecs_end = ecs.end();
+ for (auto ec_it = ecs.begin(); success && ec_it != ecs_end; ++ec_it)
+ {
+ auto &ec = *ec_it;
+ diff = ec - last_ec + 1;
+ if (diff == RL_VAL)
+ {
+ ++runlen;
+ }
+ else
+ {
+ if (runlen)
+ {
+ success &= fiboEncode<T>(RL_VAL, bufsize, buf, bitpos);
+ success &= fiboEncode<T>(runlen, bufsize, buf, bitpos);
+
+ runlen = 0;
+ }
+ success &= fiboEncode<T>(diff, bufsize, buf, bitpos);
+ }
+ last_ec = ec;
+ }
+ if (runlen && success)
+ {
+ success &= fiboEncode<T>(RL_VAL, bufsize, buf, bitpos);
+ success &= fiboEncode<T>(runlen, bufsize, buf, bitpos);
+ }
+
+ return success;
+}
+
+template <typename T = uint16_t>
+void compress_ec_matrix(std::istream &in, BUSHeader &h, const Bustools_opt &opt, std::ostream &of)
+{
+ // first m rows map to themselves. In the file, we count how many such rows there are.
+ // For the rest, we don't need the ids, only the delta+runlen-1+int-coding of the ECs
+ parseECs_stream(in, h);
+ std::cerr << "Done parsing ecs" << std::endl;
+ auto &ecs = h.ecs;
+
+ uint32_t lo = 0, hi = ecs.size() - 1, mid;
+
+ // Assume all i->i rows are at the beginning of file.
+ while (lo < hi)
+ {
+ // | ecs[i] = [i] | ? | ecs[i] != [i] |
+ // ^ ^ ^ ^
+ // 0 lo hi ecs.size()
+ mid = lo + (hi - lo + 1) / 2;
+ if (ecs.at(mid)[0] != mid || ecs.at(mid).size() != 1)
+ {
+ hi = mid - 1;
+ }
+ else
+ {
+ lo = mid;
+ }
+ }
+
+ size_t bufsize{600000};
+ T* fibonacci_buf = new T[bufsize];
+ std::fill(fibonacci_buf, fibonacci_buf + bufsize, 0);
+ const size_t wordsize = sizeof(T) * 8;
+
+ uint32_t num_identities = lo + 1;
+ uint32_t num_other = ecs.size() - num_identities;
+
+ of.write("BEC\0", 4);
+ of.write((char *)&num_identities, sizeof(num_identities));
+ of.write((char *)&num_other, sizeof(num_other));
+
+ size_t bitpos{0};
+ uint32_t row_count = 0;
+ size_t checkpoint = 0;
+ uint64_t block_header = 0;
+
+ const auto ecs_end = ecs.end();
+ auto ecs_it = ecs.begin() + lo + 1;
+ int i_row = 0;
+ uint64_t total_rows = 0;
+ try
+ {
+ while (ecs_it < ecs_end)
+ {
+ while(ecs_it < ecs_end && row_count < opt.chunk_size){
+ auto &ecs_list = *ecs_it;
+ bool success = pack_ec_row_to_file(ecs_list, bufsize, fibonacci_buf, bitpos, of);
+ if(!success)
+ {
+ size_t checkpoint_elem = checkpoint / wordsize + (checkpoint % wordsize > 0);
+ T *tmp_buf = new T[bufsize * 2];
+
+ std::fill(tmp_buf, tmp_buf + bufsize*2, 0);
+ std::memcpy(tmp_buf, fibonacci_buf, checkpoint_elem * sizeof(T));
+
+ bufsize *= 2;
+
+ delete[] fibonacci_buf;
+ fibonacci_buf = tmp_buf;
+ bitpos = checkpoint;
+ continue;
+ }
+
+ checkpoint = bitpos;
+ ++ecs_it;
+ ++row_count;
+ ++i_row;
+
+ }
+ total_rows += row_count;
+ uint64_t n_bytes = (bitpos / wordsize + (bitpos % wordsize > 0)) * sizeof(T);
+
+
+ block_header = n_bytes << 30;
+ block_header |= row_count;
+
+ of.write((char *)&block_header, sizeof(block_header));
+ of.write((char *)fibonacci_buf, n_bytes);
+
+ std::fill(fibonacci_buf, fibonacci_buf + bufsize, 0);
+ row_count = 0;
+ bitpos = 0;
+ }
+ }
+ catch (const std::bad_alloc &ex){
+ std::cerr << "Unable to allocate " << bufsize << " sized buffer\n";
+ std::cerr << ex.what() << std::endl;
+ }
+ block_header = 0;
+ of.write((char *)&block_header, sizeof(block_header));
+
+ delete[] fibonacci_buf;
+}
+
+void bustools_compress(const Bustools_opt &opt)
+{
+ BUSHeader h;
+ compressed_BUSHeader comph;
+
+ std::ofstream of;
+ std::streambuf *buf = nullptr;
+
+ if (opt.stream_out)
+ {
+ buf = std::cout.rdbuf();
+ }
+ else
+ {
+ of.open(opt.output);
+ buf = of.rdbuf();
+ }
+
+ std::ostream outf(buf);
+
+ for (const auto &infn : opt.files)
+ {
+ std::streambuf *inbuf;
+ std::ifstream inf;
+
+ if (opt.stream_in)
+ {
+ inbuf = std::cin.rdbuf();
+ }
+ else
+ {
+ inf.open(infn.c_str(), std::ios::binary);
+ inbuf = inf.rdbuf();
+ }
+
+ std::istream in(inbuf);
+ int target_file_type = identifyParseHeader(in, h, comph);
+
+ switch (target_file_type)
+ {
+ case BUSFILE_TYPE::BUSFILE:
+ // Compress a BUS file
+ compress_busfile(opt, outf, in, h);
+ break;
+ case BUSFILE_TYPE::BUSFILE_COMPRESED:
+ // decompress busz file
+ std::cerr << "Warning: The file " << infn << " is a compressed BUS file. Skipping.\n";
+ break;
+ case BUSFILE_TYPE::EC_MATRIX_COMPRESSED:
+ // Decompress matrix.ecz
+ std::cerr << "Warning: The file " << infn << " is a compressed EC matrix file. Skipping.\n";
+ break;
+ case BUSFILE_TYPE::EC_MATRIX:
+ // Compress matrix.ec
+ std::cerr << "Compressing matrix file " << infn << '\n';
+ compress_ec_matrix(in, h, opt, outf);
+ break;
+ case 0:
+ std::cerr << "Error: Unable to parse file" << std::endl;
+ break;
+ }
+ }
+}
=====================================
src/bustools_compress.h
=====================================
@@ -0,0 +1,36 @@
+#ifndef BUSTOOLS_COMPRESS_H
+#define BUSTOOLS_COMPRESS_H
+#include "Common.hpp"
+
+void bustools_compress(const Bustools_opt &opt);
+
+const std::vector<uint64_t> fibo64{1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811, 514229, 832040, 1346269, 2178309, 3524578, 5702887, 9227465, 14930352, 24157817, 39088169, 63245986, 102334155, 165580141, 267914296, 433494437, 701408733, 1134903170, 1836311903, 2971215073, 4807526976, 7778742049, 12586269025, 20365011074, 32951280099, 53316291173, 86267571272, 139583862445, 225851433717, 365435296162, 591286729879, 956722026041, 1548008755920, 2504730781961, 4052739537881, 6557470319842, 10610209857723, 17167680177565, 27777890035288, 44945570212853, 72723460248141, 117669030460994, 190392490709135, 308061521170129, 498454011879264, 806515533049393, 1304969544928657, 2111485077978050, 3416454622906707, 5527939700884757, 8944394323791464, 14472334024676221, 23416728348467685, 37889062373143906, 61305790721611591, 99194853094755497, 160500643816367088, 259695496911122585, 420196140727489673, 679891637638612258, 1100087778366101931, 1779979416004714189, 2880067194370816120, 4660046610375530309ULL, 7540113804746346429ULL, 12200160415121876738ULL};
+
+struct BUSZIndex {
+ public:
+ uint32_t bclen;
+ uint64_t n_blocks = 0;
+ uint64_t block_size;
+ uint64_t last_block;
+ std::vector<uint64_t> positions;
+ std::vector<uint64_t> barcodes;
+
+ BUSZIndex(const BUSHeader &h, const Bustools_opt &opt)
+ {
+ bclen = h.bclen;
+ block_size = opt.chunk_size;
+ positions.reserve(100);
+ barcodes.reserve(100);
+ }
+ BUSZIndex(const uint32_t _bclen, const int _block_size){
+ bclen = _bclen;
+ block_size = _block_size;
+ positions.reserve(100);
+ barcodes.reserve(100);
+ }
+};
+
+void write_BuszIndex(BUSZIndex &, std::ostream &);
+void busz_index_add(BUSZIndex &, uint64_t , uint64_t);
+
+#endif
\ No newline at end of file
=====================================
src/bustools_correct.cpp
=====================================
@@ -67,7 +67,7 @@ void bustools_split_correct(Bustools_opt &opt)
size_t stat_corr = 0;
size_t stat_corr_2 = 0;
size_t stat_uncorr = 0;
- uint64_t old_barcode;
+ uint64_t old_barcode = 0;
bool dump_bool = opt.dump_bool;
@@ -117,10 +117,10 @@ void bustools_split_correct(Bustools_opt &opt)
uint64_t mask_12; // = (1ULL << (2 * len_12)) - 1;
uint64_t mask_34; // = (1ULL << (2 * (len_34))) - 1;
- uint64_t mask_1 = (1ULL << (2 * len_1)) - 1;
- uint64_t mask_2 = (1ULL << (2 * len_2)) - 1;
- uint64_t mask_3 = (1ULL << (2 * len_3)) - 1;
- uint64_t mask_4 = (1ULL << (2 * len_4)) - 1;
+ //uint64_t mask_1 = (1ULL << (2 * len_1)) - 1;
+ //uint64_t mask_2 = (1ULL << (2 * len_2)) - 1;
+ //uint64_t mask_3 = (1ULL << (2 * len_3)) - 1;
+ //uint64_t mask_4 = (1ULL << (2 * len_4)) - 1;
while (std::getline(wf, line))
{
@@ -155,10 +155,10 @@ void bustools_split_correct(Bustools_opt &opt)
len_3 = len_34 / 2; //3, 4,4
len_4 = len_34 - len_3; //4, 4,4
- mask_1 = (1ULL << (2 * len_1)) - 1;
- mask_2 = (1ULL << (2 * len_2)) - 1;
- mask_3 = (1ULL << (2 * len_3)) - 1;
- mask_4 = (1ULL << (2 * len_4)) - 1;
+ uint64_t mask_1 = (1ULL << (2 * len_1)) - 1;
+ uint64_t mask_2 = (1ULL << (2 * len_2)) - 1;
+ uint64_t mask_3 = (1ULL << (2 * len_3)) - 1;
+ uint64_t mask_4 = (1ULL << (2 * len_4)) - 1;
// std::vector<std::pair<Roaring, Roaring>> correct(1ULL << (2 * bc2)); // 4^(bc/2) possible barcodes
=====================================
src/bustools_decompress.cpp
=====================================
@@ -0,0 +1,818 @@
+#include <iostream>
+#include <fstream>
+#include <cstring>
+#include <algorithm>
+#include <vector>
+#include <array>
+#include <stdint.h>
+#include <zlib.h>
+
+#include <assert.h>
+#include "Common.hpp"
+#include "BUSData.h"
+#include "bustools_compress.h"
+#include "bustools_decompress.h"
+size_t d_pfd_blocksize = 512;
+/**
+ * @brief Decode a single fibonacci number from a buffer
+ * @pre buf contains at least n_buf elements.
+ * @pre 0 <= bitpos < 64
+ * @pre 0 <= bufpos < n_buf
+ *
+ * @param buf The buf to fibo-decode from.
+ * @param n_buf maximum number of elements in the buffer.
+ * @param bitpos a bit offset from the left within buf[bufpos], where the next bit will be read.
+ * @param bufpos offset from the beginning of buf.
+ *
+ * @return uint64_t num; the fibonacci decoded number. If buf is exhausted before the termination bit of
+ * a fibonacci number, num represents a partially decoded number.
+ */
+template <typename T, typename DEST_T>
+DEST_T fiboDecodeSingle(T const *const buf, const size_t n_buf, size_t &bitpos, size_t &bufpos)
+{
+ uint32_t i_fibo = 0;
+ size_t SIZE = sizeof(T) * 8;
+ size_t buf_offset = bufpos;
+ size_t bitshift = SIZE - (bitpos % SIZE) - 1;
+ DEST_T num{0};
+
+ int bit = (buf[buf_offset] >> bitshift) & 1;
+ int last_bit = 0;
+
+ ++bitpos;
+
+ while (last_bit + bit < 2 && buf_offset < n_buf)
+ {
+ last_bit = bit;
+ num += bit * (fibo64[i_fibo]);
+
+ buf_offset = bufpos + bitpos / SIZE;
+ bitshift = SIZE - (bitpos % SIZE) - 1;
+
+ bit = (buf[buf_offset] >> bitshift) & 1;
+
+ ++bitpos;
+ ++i_fibo;
+ }
+
+ assert(last_bit + bit == 2);
+ assert(buf_offset <= n_buf);
+
+ bufpos += (bitpos / SIZE);
+ bitpos %= SIZE;
+ return num;
+}
+
+
+template <size_t wordsize, typename T>
+inline T read_bit(const size_t bitpos, const T num)
+{
+ return (num >> (wordsize - 1 - bitpos)) & 1;
+}
+
+/**
+ * @brief Decodes a single fibonacci-encoded number from buffered stream..
+ * @invariant: 0 <= bitpos < 8*sizeof(SRC_T)
+ * 0 <= bufpos < bufsize, if bufsize != 0, 0 otherwise
+ * bitpos is the bit-index (from left) of the bit starting the fibo-encoding of the next value.
+ * bufpos is the index in buf starting the fibo-encoding of the next value.
+
+ * @tparam SRC_T the type of the read buffer.
+ * @tparam DEST_T the type of the decoded number.
+ * @param buf read buffer. Contains Fibonacci-encoded numbers. Has at least bufsize elements.
+ * @param bufsize The number of elements last read from `in`.
+ * @param bufpos The position in buf where we start reading from. 0 <= bufpos < bufsize.
+ * @param bitpos The bit position where the next fibonacci encoding starts within a single SRC_T word.
+ * @param in the stream to decode.
+ * @return DEST_T the next fibonacci decoded number if decoding terminates before eof, 0 otherwise.
+ */
+template <typename SRC_T, typename DEST_T>
+DEST_T decodeFibonacciStream(
+ SRC_T *buf,
+ size_t &bufsize,
+ size_t &bitpos,
+ size_t &bufpos,
+ std::istream &in)
+{
+ DEST_T num = 0;
+ uint32_t i_fibo{0};
+
+ constexpr size_t wordsize = sizeof(SRC_T) * 8;
+ size_t buf_offset = bufpos;
+ size_t bit_offset = wordsize - bitpos%wordsize -1;
+
+ SRC_T bit = read_bit<wordsize>(bitpos, buf[bufpos]),
+ last_bit = 0;
+
+ bitpos = (bitpos + 1) % wordsize;
+ bufpos += bitpos == 0;
+ if (bufpos == bufsize)
+ {
+ in.read((char *)buf, sizeof(SRC_T) * bufsize);
+ bufsize = in.gcount() / sizeof(SRC_T);
+ bufpos = 0;
+ }
+
+ while (!(last_bit && bit) && bufpos <= bufsize)
+ {
+ // 0 <= bufpos < bufsize if EOF is not reached
+ // 0 <= bitpos < wordsize
+
+ num += bit * fibo64[i_fibo++];
+ last_bit = bit;
+ bit = read_bit<wordsize>(bitpos, buf[bufpos]);
+ bitpos = (bitpos + 1) % wordsize;
+ bufpos += bitpos == 0;
+
+ if (bufpos >= bufsize)
+ {
+ in.read((char *)buf, sizeof(SRC_T) * bufsize);
+ bufsize = in.gcount() / sizeof(SRC_T);
+ bufpos = 0;
+ }
+ }
+
+ return num * (last_bit && bit);
+}
+
+template <typename T>
+uint64_t eliasDeltaDecode(T const *const buf, const size_t bufsize, uint32_t &bitpos, uint32_t &bufpos)
+{
+ size_t wordsize = sizeof(T) * 8;
+ size_t max_pos = wordsize - 1;
+ uint32_t L = 0, N = 0;
+ uint64_t bit{0};
+ size_t bit_offset = max_pos - (bitpos % wordsize);
+ // TODO: make buffer cyclical and write to obuf, just like fibonacci.
+
+ // EliasDelta(X): unary(L) + headless_binary(N+1) + headless_binary(num)
+ // N = len(headless_binary(num))
+ // L = len(headless_binary(N+1))
+
+ // Decode unary encoding of L
+ while (bufpos < bufsize && !(buf[bufpos] >> (max_pos - bitpos % wordsize) & 1))
+ {
+ ++L;
+ ++bitpos;
+ bufpos += (bitpos % wordsize == 0);
+ bit_offset = max_pos - (bitpos % wordsize);
+ }
+
+ // Decode binary encoding of N+1
+ for (int i = 0; i < L + 1 && bufpos < bufsize; ++i)
+ {
+ bit = buf[bufpos] >> (max_pos - bitpos % wordsize) & 1;
+ N += bit << (L - i);
+ ++bitpos;
+ bufpos += (bitpos % wordsize == 0);
+ }
+ --N;
+
+ uint64_t num = 1ULL << N;
+
+ // Decode headless binary encoding of num
+ // we can take this in chunks, as we are dealing with "normal" bits
+ for (int i = 0; i < N && bufpos < bufsize; ++i)
+ {
+ bit = buf[bufpos] >> (max_pos - bitpos % wordsize) & 1;
+ num += bit << (N - i - 1);
+ ++bitpos;
+ bufpos += (bitpos % wordsize == 0);
+ }
+
+ if (bufpos >= bufsize)
+ {
+ throw std::out_of_range("Encoding buffer out of bounds.");
+ }
+
+ bitpos %= wordsize;
+ return num;
+}
+
+/**
+ * @brief Finish NewPFD encoding after parsing to account for exceptions..
+ *
+ * @param primary A pointer to a block of packed fixed-width integers each of size `b_bits`.
+ * @param index_gaps Delta encoded indices of exceptions in `primary`.
+ * @param exceptions The most significant bits of exceptions, each one is always > 0.
+ * @param b_bits The number of bits used for packing numbers into `primary`.
+ */
+template <typename T>
+void updatePFD(const size_t block_size, T *primary, std::vector<uint32_t> &index_gaps, std::vector<uint32_t> &exceptions, uint32_t b_bits, const T min_element)
+{
+ int i_exception = 0;
+ int exception_pos = 0;
+ assert(index_gaps.size() == exceptions.size());
+ uint32_t index = 0;
+ int n = index_gaps.size();
+ for (int i = 0; i < n; ++i)
+ {
+ auto index_diff = index_gaps[i];
+ uint32_t ex = exceptions[i];
+ index += index_diff;
+ primary[index] |= (exceptions[i] << b_bits);
+ }
+ for (int i = 0; i < block_size; ++i)
+ {
+ primary[i] += min_element;
+ }
+}
+
+/**
+ * @brief
+ *
+ * @param buf
+ * @param max_elem
+ * @param n_ints
+ * @param b_bits
+ * @param primary
+ * @param bit_pos
+ * @param buf_offset
+ * @return size_t
+ */
+
+template <typename SRC_T, typename DEST_T>
+size_t PfdParsePrimaryBlock(
+ SRC_T *buf,
+ size_t bufsize,
+ const int n_ints,
+ const size_t b_bits,
+ DEST_T *primary,
+ size_t &bit_pos,
+ size_t &bufpos)
+{
+ int i = 0;
+ constexpr size_t wordsize = sizeof(SRC_T) * 8;
+ constexpr SRC_T ONE{1};
+
+ while (i < n_ints && bufpos < bufsize)
+ {
+ // I: 0 <= bit_pos < wordsize
+
+ // The max number of bit we can read from current element, no more than b_bits
+ uint32_t n_partial = std::min(b_bits, wordsize - bit_pos);
+ uint32_t bits_rem = b_bits - n_partial;
+
+ uint32_t shift = (wordsize - n_partial - bit_pos);
+ SRC_T mask = (ONE << n_partial) - 1;
+
+ // right shift extract from buf
+ // left shift num if the current number covers two elements in buf
+
+ DEST_T num = ((buf[bufpos] >> shift) & mask) << bits_rem;
+
+ bit_pos = (bit_pos + n_partial) % wordsize;
+
+ // increment bufpos if bit_pos == 0, i.e. the next element in `buf` is reached.
+ bufpos += (!bit_pos);
+
+ if (bits_rem)
+ {
+ shift = wordsize - bits_rem;
+ num |= buf[bufpos] >> shift;
+ bit_pos += bits_rem;
+ }
+ primary[i++] = num;
+ }
+
+ return bufpos;
+}
+
+/**
+ * @brief Decompress barcodes using fibonacci-runlength(0)-delta decoding.
+ *
+ * @param BUF The char array occupied by at least `row_count` encoded numbers, encoded using delta-runlenght(0)-fibonacci.
+ * @param rows The output BUSData array.
+ * @param row_count The number of BUSData elements in `rows`.
+ * @param buf_size The size of `BUF` in bytes.
+ */
+template <typename T>
+void decompress_barcode(char *BUF, BUSData *rows, const size_t &row_count, const size_t &buf_size, size_t &bufpos)
+{
+ T *fibonacci_buf = (T *)(BUF + bufpos);
+
+ size_t fibonacci_bufsize = (buf_size - 1) / (sizeof(T)) + 1,
+ bitpos{0},
+ buf_offset{0},
+ row_index = 0;
+ uint64_t diff = 0,
+ barcode = 0,
+ runlen = 0;
+
+ const uint64_t RLE_VAL{0ULL};
+
+ while (row_index < row_count)
+ {
+ diff = fiboDecodeSingle<T, uint64_t>(fibonacci_buf, fibonacci_bufsize, bitpos, buf_offset) - 1;
+
+ if (diff == RLE_VAL)
+ {
+ // Runlength decoding
+ runlen = fiboDecodeSingle<T, uint64_t>(fibonacci_buf, fibonacci_bufsize, bitpos, buf_offset);
+ for (int i = 0; i < runlen; ++i)
+ {
+ rows[row_index].barcode = barcode;
+ ++row_index;
+ }
+ }
+ else
+ {
+ // Delta decoding
+ barcode += diff;
+ rows[row_index].barcode = barcode;
+ ++row_index;
+ }
+ }
+
+ buf_offset += bitpos > 0;
+ bufpos += buf_offset * sizeof(T);
+}
+
+/**
+ * @brief Decompress UMIS using fibonacci-runlength-periodic_delta decoding.
+ * @pre rows[0]..rows[row_count - 1] have its barcodes set from `decompress_barcodes`.
+ * @pre rows have at least `row_count` elements.
+ *
+ * @param BUF The encoded UMIs to decode.
+ * @param rows The output BUSData array whose UMI members are set in this function (up to `row_count`-1).
+ * @param row_count The number of elements in `rows` to decode.
+ * @param buf_size The size of the input array `BUF`.
+ */
+template<typename T>
+void decompress_lossless_umi(char *BUF, BUSData *rows, const size_t &row_count, const size_t &buf_size, size_t &bufpos)
+{
+ T *fibonacci_buf = (T *)(BUF + bufpos);
+ size_t fibonacci_bufsize = (buf_size - 1) / (sizeof(T)) + 1,
+ row_index = 0;
+
+ size_t bitpos{0},
+ buf_offset{0};
+
+ uint64_t diff = 0,
+ // guarantee that last_barcode != rows[0].barcode
+ last_barcode = rows[0].barcode + 1,
+ umi = 0,
+ barcode,
+ runlen = 0;
+
+ const uint64_t RLE_VAL{0ULL};
+
+ while(row_index < row_count){
+ diff = fiboDecodeSingle<T, uint64_t>(fibonacci_buf, fibonacci_bufsize, bitpos, buf_offset) - 1;
+ barcode = rows[row_index].barcode;
+ if (barcode != last_barcode)
+ {
+ umi = 0;
+ }
+
+ if (diff == RLE_VAL)
+ {
+ // diff is runlen encoded, next values are identical.
+ runlen = fiboDecodeSingle<T, uint64_t>(fibonacci_buf, fibonacci_bufsize, bitpos, buf_offset);
+ for (int i = 0; i < runlen; ++i)
+ {
+ // we must decrement umi since we incremented it during compression
+ rows[row_index].UMI = umi - 1;
+ ++row_index;
+ }
+ }
+ else{
+ // Current element is a delta
+ umi += diff;
+ // we must decrement umi since we incremented it during compression
+ rows[row_index].UMI = umi - 1;
+ ++row_index;
+ }
+ last_barcode = barcode;
+ }
+
+ buf_offset += bitpos > 0;
+ bufpos += buf_offset * sizeof(T);
+}
+
+/**
+ * @brief Decompress UMIs which have been compressed in a lossy manner.
+ * @note Not yet implemented.
+ * @pre rows[0]..rows[row_count - 1] have its barcodes set from `decompress_barcodes`.
+ * @pre rows have at least `row_count` elements.
+ *
+ * @param BUF The encoded UMIs to decode
+ * @param rows The output BUSData array whose UMI members are set in this function (up to `row_count`-1).
+ * @param row_count The number of elements in `rows` to decode.
+ * @param buf_size The size of the input array `BUF`.
+ */
+template <typename T>
+void decompress_lossy_umi(char *BUF, BUSData *rows, const size_t &row_count, const size_t &buf_size, size_t &bufpos)
+{
+}
+
+/**
+ * @brief Decompress ECs which have been compressed using NewPFD and fibonacci encoding.
+ * @param BUF The encoded ECs to decode
+ * @param rows The output BUSData array whose EC members are set in this function (up to `row_count`-1).
+ * @param row_count The number of elements in `rows` to decode.
+ * @param buf_size The size of the input array `BUF`.
+ */
+template <typename T>
+void decompress_ec(char *BUF, BUSData *rows, const size_t &row_count, const size_t &buf_size, size_t &bufpos)
+{
+ T *pfd_buf = (T *)(BUF + bufpos);
+ size_t pfd_bufsize = (buf_size - bufpos) / sizeof(T);
+ size_t buf_offset{0};
+ size_t bitpos{0};
+
+ const size_t block_size = d_pfd_blocksize;
+
+ int32_t primary[block_size];
+ uint32_t b_bits = 1;
+ int32_t min_element{0};
+ uint64_t n_exceptions{0};
+
+ std::vector<uint32_t> exceptions;
+ std::vector<uint32_t> index_gaps;
+
+ size_t row_index = 0;
+ while (row_index < row_count)
+ {
+ index_gaps.clear();
+ exceptions.clear();
+
+ b_bits = fiboDecodeSingle<T, uint32_t>(pfd_buf, pfd_bufsize, bitpos, buf_offset) - 1;
+ min_element = fiboDecodeSingle<T, int32_t>(pfd_buf, pfd_bufsize, bitpos, buf_offset) - 1;
+ n_exceptions = fiboDecodeSingle<T, uint64_t>(pfd_buf, pfd_bufsize, bitpos, buf_offset) - 1;
+
+ for (int i = 0; i < n_exceptions; ++i)
+ {
+ index_gaps.push_back(fiboDecodeSingle<T, uint32_t>(pfd_buf, pfd_bufsize, bitpos, buf_offset) - 1);
+ }
+ for (int i = 0; i < n_exceptions; ++i)
+ {
+ exceptions.push_back(fiboDecodeSingle<T, uint32_t>(pfd_buf, pfd_bufsize, bitpos, buf_offset));
+ }
+
+ buf_offset += (bitpos > 0);
+ bitpos = 0;
+
+ buf_offset = PfdParsePrimaryBlock<T, int32_t>(pfd_buf, pfd_bufsize, block_size, b_bits, primary, bitpos, buf_offset);
+
+ updatePFD(block_size, primary, index_gaps, exceptions, b_bits, min_element);
+
+ for (int i = 0; i < block_size && row_index < row_count; ++i)
+ {
+ rows[row_index].ec = primary[i];
+ ++row_index;
+ }
+ }
+
+ buf_offset += bitpos > 0;
+ bufpos += buf_offset * sizeof(T);
+}
+
+/**
+ * @brief Decompress counts which have been compressed using runlen(1) and fibonacci encoding.
+ * @param BUF The encoded counts to decode
+ * @param rows The output BUSData array whose count members are set in this function (up to `row_count`-1).
+ * @param row_count The number of elements in `rows` to decode.
+ * @param buf_size The size of the input array `BUF`.
+ */
+template <typename T>
+void decompress_counts(char *BUF, BUSData *rows, const size_t &row_count, const size_t &buf_size, size_t &bufpos)
+{
+ T *fibonacci_buf = (T *)(BUF + bufpos);
+ size_t fibonacci_bufsize{(buf_size - 1) / (sizeof(T)) + 1},
+ bitpos{0},
+ buf_offset{0},
+ row_index{0};
+
+ uint32_t curr_el{0},
+ runlen{0};
+
+ const uint32_t RLE_val{1U};
+
+ while(row_index < row_count)
+ {
+ curr_el = fiboDecodeSingle<T, uint32_t>(fibonacci_buf, fibonacci_bufsize, bitpos, buf_offset);
+ runlen = curr_el == RLE_val ? fiboDecodeSingle<T, uint32_t>(fibonacci_buf, fibonacci_bufsize, bitpos, buf_offset) : 1;
+
+ for (int i = 0; i < runlen; ++i){
+ rows[row_index].count = curr_el;
+ ++row_index;
+ }
+ }
+
+ buf_offset += (bitpos > 0);
+ bufpos += buf_offset * sizeof(T);
+}
+
+/**
+ * @brief Decompress counts which have been compressed using runlen(0) and fibonacci encoding.
+ * @param BUF The encoded counts to decode
+ * @param rows The output BUSData array whose count members are set in this function (up to `row_count`-1).
+ * @param row_count The number of elements in `rows` to decode.
+ * @param buf_size The size of the input array `BUF`.
+ */
+template <typename T>
+void decompress_flags(char *BUF, BUSData *rows, const size_t &row_count, const size_t &buf_size, size_t &bufpos)
+{
+ T *fibonacci_buf = (T *)(BUF + bufpos);
+ size_t fibonacci_bufsize{(buf_size - 1) / (sizeof(T)) + 1},
+ bitpos{0},
+ buf_offset{0},
+ row_index{0};
+
+ uint32_t curr_el{0},
+ runlen{0};
+
+ const uint32_t RLE_val{0U};
+
+ while(row_index < row_count)
+ {
+ curr_el = fiboDecodeSingle<T, uint32_t>(fibonacci_buf, fibonacci_bufsize, bitpos, buf_offset) - 1;
+ runlen = curr_el == RLE_val ? fiboDecodeSingle<T, uint32_t>(fibonacci_buf, fibonacci_bufsize, bitpos, buf_offset) : 1;
+ for (int i = 0; i < runlen; ++i)
+ {
+ rows[row_index].flags = curr_el;
+ ++row_index;
+ }
+ }
+ buf_offset += bitpos > 0;
+ bufpos += buf_offset * sizeof(T);
+}
+
+typedef void (*decompress_ptr)(char *, BUSData *, const size_t &, const size_t &, size_t &);
+
+void decompress_block(
+ size_t row_count,
+ const size_t bufsize,
+ const decompress_ptr decompressors[5],
+ char *const BUF,
+ size_t &bufpos,
+ BUSData *busdata)
+{
+ for (int i = 0; i < 5; ++i)
+ {
+ size_t srclen = bufsize;
+ decompressors[i](BUF, busdata, row_count, srclen, bufpos);
+ }
+}
+
+template <typename T>
+int32_t decompress_ec_row(
+ std::vector<int32_t> &vec,
+ size_t bufsize,
+ T *buf,
+ std::istream &in,
+ size_t &bitpos,
+ size_t &bufpos
+)
+{
+ constexpr uint32_t RL_VAL{1};
+ uint32_t n_elems = fiboDecodeSingle<T, uint32_t>(buf, bufsize, bitpos, bufpos);
+
+ int i_elem{0};
+ uint32_t ec{0};
+ uint32_t runlen{0};
+ vec.clear();
+
+ vec.resize(n_elems);
+ while (i_elem < n_elems)
+ {
+ uint32_t diff = fiboDecodeSingle<T, uint32_t>(buf, bufsize, bitpos, bufpos) - 1;
+ if (diff == RL_VAL)
+ {
+ runlen = fiboDecodeSingle<T, uint32_t>(buf, bufsize, bitpos, bufpos);
+
+ for (int j = 0; j < runlen; ++j)
+ {
+ vec[i_elem++] = ++ec;
+ }
+ }
+ else
+ {
+ ec += diff;
+ vec[i_elem++] = ec;
+ }
+ }
+
+ return n_elems;
+}
+
+/**
+ * @brief Decompress a compress matrix file
+ * pre: inf.tellg() == 4
+ * the first 4 bytes of the stream were BEC\0
+ * @tparam T
+ * @param inf
+ * @param header
+ * @param bufsize
+ * @return bool
+ */
+template <typename T = uint16_t>
+bool decompress_matrix(std::istream &inf, BUSHeader &header, size_t bufsize=100000)
+{
+ char magic[4];
+ inf.read(magic, 4);
+ if(std::strcmp(magic, "BEC\0")) {
+ return false;
+ }
+
+ // Number of rows mapping to themselves.
+ uint32_t num_identities{0};
+ // Number of rows not mapping to themselves.
+ uint32_t num_rows{0};
+
+ inf.read((char *)&num_identities, sizeof(num_identities));
+ inf.read((char *)&num_rows, sizeof(num_rows));
+
+ std::vector<std::vector<int32_t>> &ecs = header.ecs;
+ ecs.resize(num_identities);
+
+ int32_t ec_idx = 0;
+ for (; ec_idx < num_identities; ++ec_idx)
+ {
+ ecs[ec_idx].push_back(ec_idx);
+ }
+
+ uint64_t block_header = 1;
+ bufsize = 600000;
+ uint64_t block_size_bytes = 0;
+ uint64_t block_count = 0;
+ uint64_t row_count_mask = (1 << 30) - 1;
+ try
+ {
+ T *buffer = new T[bufsize];
+ std::vector<int32_t> vec;
+ vec.reserve(10000);
+
+ size_t bitpos = 0;
+ size_t bufpos = 0;
+ int block_counter = 0;
+
+ inf.read((char *)&block_header, sizeof(block_header));
+ while (block_header)
+ {
+ block_size_bytes = block_header >> 30;
+ block_count = block_header & row_count_mask;
+
+ if (block_size_bytes > bufsize * sizeof(T)){
+ bufsize = block_size_bytes / sizeof(T);
+ delete[] buffer;
+ buffer = new T[bufsize];
+ }
+
+ inf.read((char *)buffer, block_size_bytes);
+ for (int i = 0; i < block_count; ++i)
+ {
+ vec.clear();
+ int32_t n_elems = decompress_ec_row(vec, bufsize, buffer, inf, bitpos, bufpos);
+ ecs.push_back(std::move(vec));
+ }
+
+ inf.read((char *)&block_header, sizeof(block_header));
+ bitpos = 0;
+ bufpos = 0;
+ block_counter++;
+ }
+
+ delete[] buffer;
+ }
+ catch (const std::bad_alloc &ex)
+ {
+ std::cerr << "Error: Unable to allocate buffer.\n\t"
+ << ex.what() << std::endl;
+ return false;
+ }
+ catch (const std::exception &exception)
+ {
+ std::cerr << "Error: Unable to decode EC matrix:\n\t"
+ << exception.what() << std::endl;
+ return false;
+ }
+
+ writeECs("output/matrix_test.ec", header);
+
+ return true;
+}
+
+void decompress_buszfile(std::istream &in, compressed_BUSHeader &comp_h, std::ostream &outf)
+{
+ writeHeader(outf, comp_h.bus_header);
+ BUSData *busdata = new BUSData[comp_h.chunk_size];
+
+ decompress_ptr decompressors[5]{
+ &decompress_barcode<uint64_t>,
+ comp_h.lossy_umi ? &decompress_lossy_umi<uint64_t> : &decompress_lossless_umi<uint64_t>,
+ &decompress_ec<uint32_t>,
+ &decompress_counts<uint64_t>,
+ &decompress_flags<uint64_t>,
+ };
+ d_pfd_blocksize = comp_h.pfd_blocksize;
+
+ try
+ {
+ uint64_t max_block_size = 6 * comp_h.chunk_size;
+ uint32_t i_chunk = 0;
+
+ char *BUF = new char[max_block_size];
+ size_t bufpos = 0;
+
+ uint64_t const row_count_mask = (1ULL << 30) - 1;
+ uint64_t block_header = 0,
+ row_count = 0,
+ block_size = 0;
+
+ in.read((char *)&block_header, sizeof(uint64_t));
+ while (block_header && in.good())
+ {
+ block_size = block_header >> 30;
+ row_count = block_header & row_count_mask;
+
+ if (block_size > max_block_size)
+ {
+ delete[] BUF;
+ max_block_size += block_size;
+ BUF = new char[max_block_size];
+ }
+
+ in.read((char *)BUF, block_size);
+ for (int i = 0; i < 5; ++i){
+ size_t srclen = block_size;
+ decompressors[i](BUF, busdata, row_count, srclen, bufpos);
+ }
+ // decompress_block(row_count, block_size, decompressors, BUF, bufpos, busdata);
+ outf.write((char *)busdata, row_count * sizeof(BUSData));
+ bufpos = 0;
+
+ in.read((char *)&block_header, sizeof(uint64_t));
+ ++i_chunk;
+ }
+
+ delete[] BUF;
+ }
+ catch (const std::bad_alloc &ex)
+ {
+ std::cerr << "Unable to allocate buffer" << std::endl;
+ }
+
+ delete[] busdata;
+}
+
+void bustools_decompress(const Bustools_opt &opt)
+{
+ compressed_BUSHeader comp_header;
+ BUSHeader header;
+ std::ofstream of;
+ std::streambuf *buf = nullptr;
+
+ if (opt.stream_out)
+ {
+ buf = std::cout.rdbuf();
+ }
+ else
+ {
+ of.open(opt.output);
+ buf = of.rdbuf();
+ }
+
+ std::ostream outf(buf);
+ for (const auto &infn : opt.files)
+ {
+ std::streambuf *inbuf;
+ std::ifstream inf;
+
+ if (opt.stream_in)
+ {
+ inbuf = std::cin.rdbuf();
+ }
+ else
+ {
+ inf.open(infn.c_str(), std::ios::binary);
+ inbuf = inf.rdbuf();
+ }
+ std::istream in(inbuf);
+
+ int target_file_type = identifyParseHeader(in, header, comp_header);
+ switch (target_file_type)
+ {
+ case 1:
+ std::cerr << "Warning: The file " << infn << " is an uncompressed BUS file. Skipping\n" ;
+ continue;
+ case 2:
+ // compressed bus file
+ decompress_buszfile(in, comp_header, outf);
+ continue;
+ case 3:
+ std::cerr << "Warning: The file " << infn << " is an uncompressed EC matrix file. Skipping\n";
+ continue;
+ case 4:
+ decompress_matrix(inf, comp_header.bus_header);
+ continue;
+ case 0:
+ std::cerr << "Error: Unable to parse or open file " << infn << '\n';
+ continue;
+ default:
+ std::cerr << "Warning: Unknown file type. Skipping.\n";
+ continue;
+ }
+ }
+}
\ No newline at end of file
=====================================
src/bustools_decompress.h
=====================================
@@ -0,0 +1,6 @@
+#ifndef BUSTOOLS_DECOMPRESS_H
+#define BUSTOOLS_DECOMPRESS_H
+#include "Common.hpp"
+
+void bustools_decompress(const Bustools_opt &opt);
+#endif
\ No newline at end of file
=====================================
src/bustools_inspect.cpp
=====================================
@@ -172,7 +172,12 @@ void bustools_inspect(Bustools_opt &opt) {
}
if (ecmap.size()) {
+ auto freq_targets_size = freq_targets.size();
for (const auto &target : ecmap[p[i].ec]) {
+ if (target >= freq_targets_size) {
+ freq_targets.resize(target+1, 0);
+ freq_targets_size = freq_targets.size();
+ }
++freq_targets[target];
}
uint32_t targetsInSet = ecmap[p[i].ec].size();
=====================================
src/bustools_main.cpp
=====================================
@@ -37,6 +37,8 @@
#include "bustools_predict.h"
#include "bustools_collapse.h"
#include "bustools_clusterhist.h"
+#include "bustools_compress.h"
+#include "bustools_decompress.h"
int my_mkdir(const char *path, mode_t mode)
{
@@ -960,6 +962,133 @@ void parse_ProgramOptions_extract(int argc, char **argv, Bustools_opt &opt)
}
}
+/**
+ * @brief Parse command line arguments for bustools inflate.
+ *
+ * @param argc
+ * @param argv
+ * @param opt
+ * @return bool true iff requested from the command line.
+ */
+bool parse_ProgramOptions_inflate(int argc, char **argv, Bustools_opt &opt)
+{
+ const char *opt_string = "o:ph";
+ bool print_usage = false;
+ static struct option long_options[] = {
+ {"output", required_argument, 0, 'o'},
+ {"pipe", no_argument, 0, 'p'},
+ {"help", no_argument, 0, 'h'},
+ {0, 0, 0, 0},
+ };
+ int option_index = 0, c;
+ while ((c = getopt_long(argc, argv, opt_string, long_options, &option_index)) != -1)
+ {
+ switch (c)
+ {
+ case 'o':
+ opt.output = optarg;
+ break;
+ case 'p':
+ opt.stream_out = true;
+ break;
+ case 'h':
+ print_usage = true;
+ break;
+ default:
+ break;
+ }
+ }
+
+ while (optind < argc)
+ opt.files.push_back(argv[optind++]);
+
+ if (opt.files.size() == 1 && opt.files[0] == "-")
+ {
+ opt.stream_in = true;
+ }
+ return print_usage;
+}
+
+/**
+ * @brief Parse command line arguments for bustools compress.
+ *
+ * @param argc
+ * @param argv
+ * @param opt
+ * @return bool true iff requested from the command line.
+ */
+bool parse_ProgramOptions_compress(int argc, char **argv, Bustools_opt &opt)
+{
+ const char *opt_string = "N:Lo:pP:h:i::";
+ const bool lossy_umi_enabled = false;
+
+ static struct option long_options[] = {
+ {"chunk-size", required_argument, 0, 'N'},
+ {"lossy-umi", no_argument, 0, 'L'},
+ {"output", required_argument, 0, 'o'},
+ {"pipe", no_argument, 0, 'p'},
+ {"pfd-size", required_argument, 0, 'P'},
+ {"index", optional_argument, 0, 'i'},
+ {"help", no_argument, 0, 'h'},
+ {0, 0, 0, 0}};
+ int option_index = 0, c;
+ bool print_usage = false;
+
+ while ((c = getopt_long(argc, argv, opt_string, long_options, &option_index)) != -1)
+ {
+ switch (c)
+ {
+ case 'i':
+ {
+ if (optarg == NULL && optind < argc - 1 && argv[optind][0] != '-')
+ {
+ optarg = argv[optind++];
+ }
+ if (optarg == NULL)
+ {
+ opt.busz_index.assign("output.busz.idx");
+ }
+ else
+ {
+ opt.busz_index.assign(optarg);
+ }
+ break;
+ }
+ case 'N':
+ opt.chunk_size = atoi(optarg);
+ break;
+ case 'L':
+ if (!lossy_umi_enabled)
+ std::cerr << "Lossy UMI not yet implemented. Using lossless instead." << std::endl;
+ opt.lossy_umi = lossy_umi_enabled;
+ break;
+ case 'o':
+ opt.output = optarg;
+ break;
+ case 'p':
+ opt.stream_out = true;
+ break;
+ case 'P':
+ opt.pfd_blocksize = atoi(optarg);
+ break;
+ case 'h':
+ print_usage = true;
+ break;
+ default:
+ break;
+ }
+ }
+ // all other arguments are fast[a/q] files to be read
+ while (optind < argc)
+ opt.files.push_back(argv[optind++]);
+
+ if (opt.files.size() == 1 && opt.files[0] == "-")
+ {
+ opt.stream_in = true;
+ }
+ return print_usage;
+}
+
bool check_ProgramOptions_sort(Bustools_opt &opt)
{
@@ -2303,6 +2432,91 @@ bool check_ProgramOptions_extract(Bustools_opt &opt)
return ret;
}
+bool check_ProgramOptions_inflate(Bustools_opt &opt)
+{
+
+ bool ret = true;
+
+ if (!opt.stream_out)
+ {
+ if (opt.output.empty())
+ {
+ std::cerr << "Error: missing output file" << std::endl;
+ ret = false;
+ }
+ else if (!checkOutputFileValid(opt.output))
+ {
+ std::cerr << "Error: unable to open output file" << std::endl;
+ ret = false;
+ }
+ }
+
+ if (opt.files.size() != 1)
+ {
+ ret = false;
+ if (opt.files.size() == 0)
+ {
+ std::cerr << "Error: Missing BUSZ input file" << std::endl;
+ }
+ else
+ {
+ std::cerr << "Error: Multiple files not yet supported" << std::endl;
+ }
+ }
+ else if (!opt.stream_in)
+ {
+ if (!checkFileExists(opt.files[0]))
+ {
+ std::cerr << "Error: File not found, " << opt.files[0] << std::endl;
+ ret = false;
+ }
+ }
+
+ return ret;
+}
+
+bool check_ProgramOptions_compress(Bustools_opt &opt)
+{
+ bool ret = true;
+
+ if (!opt.stream_out)
+ {
+ if (opt.output.empty())
+ {
+ std::cerr << "Error: missing output file" << std::endl;
+ ret = false;
+ }
+ else if (!checkOutputFileValid(opt.output))
+ {
+ std::cerr << "Error: unable to open output file" << std::endl;
+ ret = false;
+ }
+ }
+
+ if (opt.files.size() != 1)
+ {
+ ret = false;
+ if (opt.files.size() == 0)
+ {
+ std::cerr << "Error: Missing BUS input file" << std::endl;
+ }
+ else
+ {
+ std::cerr << "Error: Multiple files not yet supported" << std::endl;
+ }
+ }
+ else if (!opt.stream_in)
+ {
+ if (!checkFileExists(opt.files[0]))
+ {
+ std::cerr << "Error: File not found, " << opt.files[0] << std::endl;
+ ret = false;
+ }
+ }
+
+ return ret;
+}
+
void Bustools_Usage()
{
std::cout << "bustools " << BUSTOOLS_VERSION << std::endl
@@ -2326,6 +2540,8 @@ void Bustools_Usage()
<< "collapse Turn BUS files into a BUG file" << std::endl
<< "clusterhist Create UMI histograms per cluster" << std::endl
<< "linker Remove section of barcodes in BUS files" << std::endl
+ << "compress Compress a BUS file" << std::endl
+ << "inflate Decompress a BUSZ (compressed BUS) file" << std::endl
<< "version Prints version number" << std::endl
<< "cite Prints citation information" << std::endl
<< std::endl
@@ -2557,6 +2773,31 @@ void Bustools_extract_Usage()
<< std::endl;
}
+void Bustools_compress_Usage()
+{
+ std::cout << "Usage: bustools compress [options] sorted-bus-file" << std::endl
+ << "Note: BUS file should be sorted by barcode-umi-ec" << std::endl
+ << std::endl
+ << "Options: " << std::endl
+ << "-N, --chunk-size CHUNK_SIZE Number of rows to compress as a single block." << std::endl
+ // << "-L, --lossy-umi Allow lossy compression over UMIs. Each UMI will be renamed for minimal compression." << std::endl
+ << "-o, --output OUTPUT Write compressed file to OUTPUT." << std::endl
+ << "-p, --pipe Write to standard output." << std::endl
+ << "-h, --help Print this message and exit." << std::endl
+ << std::endl;
+}
+
+void Bustools_inflate_Usage()
+{
+ std::cout << "Usage: bustools {inflate | decompress} [options] compressed-bus-file" << std::endl
+ << std::endl
+ << "Options: " << std::endl
+ << "-p, --pipe Write to standard output." << std::endl
+ << "-o, --output OUTPUT File for inflated output." << std::endl
+ << "-h, --help Print this message and exit." << std::endl
+ << std::endl;
+}
+
void print_citation()
{
std::cout << "When using this program in your research, please cite" << std::endl
@@ -2915,6 +3156,52 @@ int main(int argc, char **argv)
exit(1);
}
}
+ else if(cmd == "compress")
+ {
+ if (disp_help)
+ {
+ Bustools_compress_Usage();
+ exit(0);
+ }
+ if (parse_ProgramOptions_compress(argc - 1, argv + 1, opt))
+ {
+ Bustools_compress_Usage();
+ exit(0);
+ }
+ else if (check_ProgramOptions_compress(opt))
+ {
+ bustools_compress(opt);
+ exit(0);
+ }
+ else
+ {
+ Bustools_compress_Usage();
+ exit(1);
+ }
+ }
+ else if(cmd == "inflate" || cmd == "decompress")
+ {
+ if (disp_help)
+ {
+ Bustools_inflate_Usage();
+ exit(0);
+ }
+ if (parse_ProgramOptions_inflate(argc - 1, argv + 1, opt))
+ {
+ Bustools_inflate_Usage();
+ exit(0);
+ }
+ else if (check_ProgramOptions_inflate(opt))
+ {
+ bustools_decompress(opt);
+ exit(0);
+ }
+ else
+ {
+ Bustools_inflate_Usage();
+ exit(1);
+ }
+ }
else
{
std::cerr << "Error: invalid command " << cmd << std::endl;
=====================================
src/bustools_mash.cpp
=====================================
@@ -2,6 +2,7 @@
#include <fstream>
#include <algorithm>
#include <queue>
+#include <vector>
#include <functional>
#include <unordered_map>
#include <unordered_set>
@@ -176,7 +177,7 @@ void bustools_mash(const Bustools_opt &opt)
int32_t nr = 0, nw = 0;
int32_t N = 1024;
- std::queue<BUSData> queue[nf];
+ std::vector<std::queue<BUSData>> queue(nf);
for (int i = 0; i < nf; i++)
{
=====================================
src/bustools_merge.cpp
=====================================
@@ -83,7 +83,7 @@ void bustools_merge_different_index(const Bustools_opt &opt)
// put the ecs into a ecmap inv
std::unordered_map<std::vector<int32_t>, int32_t, SortedVectorHasher> ecmapinv;
- for (int32_t ec; ec < h.ecs.size(); ec++)
+ for (std::size_t ec = 0; ec < h.ecs.size(); ec++)
{
ecmapinv.insert({h.ecs[ec], ec});
}
=====================================
src/bustools_sort.cpp
=====================================
@@ -11,6 +11,14 @@
#define TP std::pair<BUSData, int>
+//This code is for automatically creating the tmp directory supplied if it doesn't exist
+#if defined(__MINGW32__) || defined(_MSC_VER)
+//#include <filesystem> //once filesystem is acceptable for minGW, switch to that
+#include "windows.h" //Needed for CreateDirectory
+
+#endif
+
+
inline bool cmp1(const BUSData &a, const BUSData &b)
{
if (a.barcode == b.barcode)
@@ -322,8 +330,22 @@ inline bool ncmp5(const TP &a, const TP &b)
void bustools_sort(const Bustools_opt &opt)
{
+ auto mem = opt.max_memory;
+ //There is a bug in Windows, where bustools sort fails. The problem is that
+ //gcount for some reason fails here if too much is read and returns 0, even though
+ //it succeeds. Could perhaps be a 32 bit issue somewhere, does size_t become 32 bits?
+ //Anyway, this is a workaround that fixes the issue - does the same as the flag -m 100000000.
+ //An interesting observation is that opt.max_memory is set to 1 << 32, which will become exactly
+ //zero if truncated to 32 bits...
+#if defined(__MINGW32__) || defined(_MSC_VER)
+ const size_t win_mem_max = 1e8;
+ if (mem > win_mem_max)
+ {
+ mem = win_mem_max;
+ }
+#endif
BUSHeader h;
- size_t N = opt.max_memory / sizeof(BUSData);
+ size_t N = mem / sizeof(BUSData);
BUSData *p = new BUSData[N];
char magic[4];
uint32_t version = 0;
@@ -359,6 +381,36 @@ void bustools_sort(const Bustools_opt &opt)
exit(1);
}
+#if defined(__MINGW32__) || defined(_MSC_VER)
+ //Make sure to create the tmp directory if it doesn't exist - writing temporary files fails otherwise in Windows
+ //First get the directory - in theory, opt.temp_files can look like "tmp/x_" or just "x_" (or even nothing)
+ //so we should find the last slash and make sure that directory exists
+
+ std::size_t ind = opt.temp_files.rfind('/');
+ std::size_t ind2 = opt.temp_files.rfind('\\');
+ if (ind == std::string::npos)
+ {
+ ind = ind2;
+ }
+ else if (ind2 != std::string::npos)
+ {
+ //both valid, take the largest value (representing the last slash)
+ ind = std::max(ind, ind2);
+ }
+ if (ind != std::string::npos)
+ {
+ auto dirName = opt.temp_files.substr(0, ind);
+ //When our MinGW builds support c++17, change to std::filesystem
+
+ //std::filesystem::path filepath = dirName;
+ //if (!std::filesystem::is_directory(filepath))
+ //{
+ // std::filesystem::create_directory(filepath);
+ //}
+ CreateDirectory(dirName.c_str(), NULL); //This will do nothing if the directory exists already
+ }
+#endif
+
size_t sc = 0;
int tmp_file_no = 0;
for (const auto &infn : opt.files)
=====================================
src/bustools_whitelist.cpp
=====================================
@@ -35,8 +35,8 @@ void bustools_whitelist(Bustools_opt &opt) {
int wl_count = 0;
uint32_t bc_r = 0, bc_u = 0;
- uint64_t curr_umi;
- uint64_t curr_bc;
+ uint64_t curr_umi = 0;
+ uint64_t curr_bc = 0;
int bc_count = -1;
/* Determine threshold. */
View it on GitLab: https://salsa.debian.org/med-team/bustools/-/commit/3f436bca439f6be16ed0cdba3e2194fcdd8b2339
--
View it on GitLab: https://salsa.debian.org/med-team/bustools/-/commit/3f436bca439f6be16ed0cdba3e2194fcdd8b2339
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20221231/40f37407/attachment-0001.htm>
More information about the debian-med-commit
mailing list