[med-svn] [Git][med-team/xxsds-dynamic][upstream] New upstream version 1.0~alpha.1+git20210426.548c6f7
Andreas Tille (@tille)
gitlab at salsa.debian.org
Mon Dec 13 19:09:09 GMT 2021
Andreas Tille pushed to branch upstream at Debian Med / xxsds-dynamic
Commits:
4cdccbd6 by Andreas Tille at 2021-12-13T20:07:20+01:00
New upstream version 1.0~alpha.1+git20210426.548c6f7
- - - - -
2 changed files:
- h0_lz77.cpp
- include/dynamic/algorithms/h0_lz77.hpp
Changes:
=====================================
h0_lz77.cpp
=====================================
@@ -23,65 +23,105 @@
using namespace std;
using namespace dyn;
+ulint sa_rate = 0;
+bool int_file = false;
+
+void help(){
+
+cout << "Build LZ77 using a zero-order compressed FM index." << endl << endl;
+ cout << "Usage: h0_lz77 [options] <input_file> <output_file> " << endl;
+ cout << "Options: " << endl;
+ cout << "-s <sample_rate> store one SA sample every sample_rate positions. default: 256." << endl;
+ cout << "-i Interpret the file as a stream of 32-bits integers." << endl;
+ cout << "input_file: file to be parsed" << endl;
+ cout << "output_file: LZ77 triples <start,length,trailing_character> will be saved in binary format in this file" << endl << endl;
+ cout << "Note: the file should terminate with a character (or int if -i) not appearing elsewhere." << endl;
+
+ exit(0);
+
+}
+
+
+void parse_args(char** argv, int argc, int &ptr){
+
+ assert(ptr<argc);
+
+ string s(argv[ptr]);
+ ptr++;
+
+ if(s.compare("-s")==0){
+
+ sa_rate = atoi(argv[ptr++]);
+
+ }else if(s.compare("-i")==0){
+
+ int_file = true;
+
+ }else{
+ cout << "Error: unrecognized '" << s << "' option." << endl;
+ help();
+ }
+
+}
+
+
int main(int argc,char** argv) {
using std::chrono::high_resolution_clock;
using std::chrono::duration_cast;
using std::chrono::duration;
- if(argc!=3 and argc !=4){
+ if(argc < 3) help();
- cout << "Build LZ77 using a zero-order compressed FM index." << endl << endl;
- cout << "Usage: h0_lz77 [sample_rate] <input_file> <output_file> " << endl;
- cout << " sample_rate: store one SA sample every sample_rate positions. default: 256." << endl;
- cout << " input_file: file to be parsed" << endl;
- cout << " output_file: LZ77 triples <start,length,char> will be saved in text format in this file" << endl;
+ //parse options
- exit(0);
+ int ptr = 1;
- }
+ if(argc<3) help();
- using lz77_t = h0_lz77<wt_fmi>;
+ while(ptr<argc-2)
+ parse_args(argv, argc, ptr);
- /*
- * uncomment this (and comment the above line) to use instead a
- * run-length encoded FM index.
- */
- //using lz77_t = h0_lz77<rle_fmi>;
+ string in = string(argv[ptr++]);
+ string out = string(argv[ptr]);
- auto t1 = high_resolution_clock::now();
+ using lz77_t = h0_lz77<wt_fmi>;
lz77_t lz77;
ulint DEFAULT_SA_RATE = lz77_t::DEFAULT_SA_RATE;
- ulint sa_rate = argc == 3 ? DEFAULT_SA_RATE : atoi(argv[1]);
+ sa_rate = not sa_rate ? DEFAULT_SA_RATE : sa_rate;
- sa_rate = sa_rate == 0 ? 1 : sa_rate;
+ auto t1 = high_resolution_clock::now();
- string in(argv[1+(argc==4)]);
- string out(argv[2+(argc==4)]);
cout << "Sample rate is " << sa_rate << endl;
- {
+ if(not int_file){
- cout << "Detecting alphabet ... " << flush;
- std::ifstream ifs(in);
+ {
+ cout << "Detecting alphabet ... " << flush;
+ std::ifstream ifs(in);
- lz77 = lz77_t(ifs, sa_rate);
- ifs.close();
+ lz77 = lz77_t(ifs, sa_rate);
- cout << "done." << endl;
+ cout << "done." << endl;
+ }
- }
+ std::ifstream ifs(in);
+ std::ofstream os(out, ios::binary);
+
+ lz77.parse(ifs,os,1,true);
- std::ifstream ifs(in);
- std::ofstream os(out);
+ }else{
- lz77.parse(ifs,os,15,true);
+ lz77 = lz77_t(~uint(0), sa_rate);
+ std::ifstream ifs(in, ios::binary);
+ std::ofstream os(out, ios::binary);
- ifs.close();
- os.close();
+ lz77.parse_int(ifs,os,1,true);
+
+ }
auto t2 = high_resolution_clock::now();
=====================================
include/dynamic/algorithms/h0_lz77.hpp
=====================================
@@ -104,9 +104,7 @@ public:
* input: an input stream and an output stream
* the algorithms scans the input (just 1 scan) and
* saves to the output stream (could be a file) a series
- * of triples <pos,len,c> of type <ulint,ulint,uchar>. Types
- * are converted to char* before streaming them to out
- * (i.e. ulint to 8 bytes and uchar to 1 byte). len is the length
+ * of triples <pos,len,c> of type <ulint,ulint,uchar>. len is the length
* of the copied string (i.e. excluded skipped characters in the end)
*
* after the end of a phrase, skip 'skip'>0 characters, included trailing character (LZ77
@@ -184,12 +182,9 @@ public:
exit(0);
}
- auto start = (char*)(new ulint(p));
- auto l = (char*)(new ulint(len));
-
- out.write(start,sizeof(ulint));
- out.write(l,sizeof(ulint));
- out.write(&cc,1);
+ out.write((char*)&p,sizeof(ulint));
+ out.write((char*)&len,sizeof(ulint));
+ out.write((char*)&cc,sizeof(cc));
gamma_bits += gamma(uint64_t(backward_pos+1));
gamma_bits += gamma(uint64_t(len+1));
@@ -199,10 +194,6 @@ public:
delta_bits += delta(uint64_t(len+1));
delta_bits += delta(uint64_t(uint8_t(cc)));
-
- delete start;
- delete l;
-
z++;
len = 0;
p = 0;
@@ -248,6 +239,140 @@ public:
}
+ /*
+ * input: an input integer stream (32 bits) and an output stream
+ * the algorithms scans the input (just 1 scan) and
+ * saves to the output stream (could be a file) a series
+ * of triples <pos,len,c> of type <ulint,ulint,int>. len is the length
+ * of the copied string (i.e. excluded skipped characters in the end)
+ *
+ * after the end of a phrase, skip 'skip'>0 characters, included trailing character (LZ77
+ * sparsification, experimental)
+ *
+ * to get also the last factor, input stream should
+ * terminate with a character that does not appear elsewhere
+ * in the stream
+ *
+ */
+ void parse_int(istream& in, ostream& out, ulint skip = 1, bool verbose = false){
+
+ //size of the output if this is compressed using gamma/delta encoding
+ uint64_t gamma_bits = 0;
+ uint64_t delta_bits = 0;
+
+ assert(skip>0);
+
+ long int step = 100000; //print status every step characters
+ long int last_step = 0;
+
+ assert(fmi.size()==1); //only terminator
+
+ pair<ulint, ulint> range = fmi.get_full_interval(); //BWT range of current phrase
+
+ ulint len = 0; //length of current LZ phrase
+ ulint i = 0; //position of terminator character in bwt
+ ulint p = 0; //phrase occurrence
+
+ ulint z = 0; //number of LZ77 phrases
+
+ if(verbose) cout << "Parsing ..." << endl;
+
+ int cc;
+ ulint n = 0;
+ while(in.read((char*)&cc,sizeof(int))){
+
+ n++;
+ //cout << cc;
+
+ if(verbose){
+
+ if(n>last_step+(step-1)){
+
+ last_step = n;
+ cout << " " << n << " integers processed ..." << endl;
+
+ }
+
+ }
+
+ uint c(cc);
+
+ auto new_range = fmi.LF(range,c);
+
+ if(new_range.first >= new_range.second){
+
+ //cout << ":";
+
+ //empty range: new factor
+
+ ulint occ;
+
+ if(len>0){
+
+ occ = i == range.first ? range.second-1 : range.first;
+ p = fmi.locate(occ) - len;
+
+ }
+
+ fmi.extend(c);
+
+ uint64_t backward_pos = len == 0 ? 0 : (fmi.text_length() - len - 1) - p;
+
+ if(backward_pos > fmi.text_length()){
+ cout << "err" << endl;
+ exit(0);
+ }
+
+ out.write((char*)&p,sizeof(ulint));
+ out.write((char*)&len,sizeof(ulint));
+ out.write((char*)&cc,sizeof(cc));
+
+ z++;
+ len = 0;
+ p = 0;
+
+ //skip characters
+
+ ulint k = 0;
+
+ while(k < skip-1 && in.read((char*)&cc,sizeof(int))){
+
+ //cout << cc;
+
+ fmi.extend(uint(cc));
+ k++;
+ n++;
+
+ }
+
+ //cout << "|";
+
+ range = fmi.get_full_interval();
+
+ }else{
+
+ len++; //increase current phrase length
+ fmi.extend(c); //insert character c in the BWT
+ i = fmi.get_terminator_position(); //get new terminator position
+ range = {new_range.first, new_range.second+1}; //new suffix falls inside current range: extend
+
+ }
+
+
+ }
+
+ if(verbose){
+
+ cout << "\nNumber of integers: " << n << endl;
+ cout << "Number of LZ77 phrases: " << z << endl;
+
+
+ }
+
+
+ }
+
+
/*
* Total number of bits allocated in RAM for this structure
*
View it on GitLab: https://salsa.debian.org/med-team/xxsds-dynamic/-/commit/4cdccbd6673723de83722382dfcb09b39b598312
--
View it on GitLab: https://salsa.debian.org/med-team/xxsds-dynamic/-/commit/4cdccbd6673723de83722382dfcb09b39b598312
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20211213/685a848e/attachment-0001.htm>
More information about the debian-med-commit mailing list