[med-svn] [Git][med-team/xxsds-dynamic][upstream] New upstream version 1.0~alpha.1+git20210426.548c6f7

Andreas Tille (@tille) gitlab at salsa.debian.org
Mon Dec 13 19:09:09 GMT 2021



Andreas Tille pushed to branch upstream at Debian Med / xxsds-dynamic


Commits:
4cdccbd6 by Andreas Tille at 2021-12-13T20:07:20+01:00
New upstream version 1.0~alpha.1+git20210426.548c6f7
- - - - -


2 changed files:

- h0_lz77.cpp
- include/dynamic/algorithms/h0_lz77.hpp


Changes:

=====================================
h0_lz77.cpp
=====================================
@@ -23,65 +23,105 @@
 using namespace std;
 using namespace dyn;
 
+ulint sa_rate = 0;
+bool int_file = false;
+
+// Print the usage text to stdout, then terminate the process with status 0.
+void help(){
+	cout << "Build LZ77 using a zero-order compressed FM index." << endl << endl
+	     << "Usage: h0_lz77 [options] <input_file> <output_file> " << endl
+	     << "Options: " << endl
+	     << "-s <sample_rate>   store one SA sample every sample_rate positions. default: 256." << endl
+	     << "-i                 Interpret the file as a stream of 32-bits integers." << endl
+	     << "input_file: file to be parsed" << endl
+	     << "output_file: LZ77 triples <start,length,trailing_character> will be saved in binary format in this file" << endl << endl
+	     << "Note: the file should terminate with a character (or int if -i) not appearing elsewhere." << endl;
+
+	exit(0);
+
+}
+
+
+// Consume the option at argv[ptr], advancing ptr past it and its value (if any).
+// Options live before the final two arguments (<input_file> <output_file>), so
+// the value of -s must precede them; otherwise report the error via help().
+void parse_args(char** argv, int argc, int &ptr){
+	assert(ptr<argc);
+	string s(argv[ptr++]);
+	if(s.compare("-s")==0){
+		char* end = nullptr;	//stays null when no value is available
+		if(ptr < argc-2 && argv[ptr][0] != '-') sa_rate = strtoul(argv[ptr], &end, 10);
+		if(end == nullptr || end == argv[ptr] || *end != '\0'){
+			cout << "Error: option '-s' requires a non-negative integer value." << endl;
+			help();	//prints usage and exits
+		}
+		ptr++;
+	}else if(s.compare("-i")==0){
+		int_file = true;
+	}else{
+		cout << "Error: unrecognized '" << s << "' option." << endl;
+		help();
+	}
+}
+
+
 int main(int argc,char** argv) {
 
 	using std::chrono::high_resolution_clock;
 	using std::chrono::duration_cast;
 	using std::chrono::duration;
 
-	if(argc!=3 and argc !=4){
+	if(argc < 3) help();
 
-		cout << "Build LZ77 using a zero-order compressed FM index." << endl << endl;
-		cout << "Usage: h0_lz77 [sample_rate] <input_file> <output_file> " << endl;
-		cout << "   sample_rate: store one SA sample every sample_rate positions. default: 256." << endl;
-		cout << "   input_file: file to be parsed" << endl;
-		cout << "   output_file: LZ77 triples <start,length,char> will be saved in text format in this file" << endl;
+	//parse options
 
-		exit(0);
+	int ptr = 1;
 
-	}
+	if(argc<3) help();
 
-	using lz77_t = h0_lz77<wt_fmi>;
+	while(ptr<argc-2)
+		parse_args(argv, argc, ptr);
 
-	/*
-	 * uncomment this (and comment the above line) to use instead a
-	 * run-length encoded FM index.
-	 */
-	//using lz77_t = h0_lz77<rle_fmi>;
+	string in = string(argv[ptr++]);
+	string out = string(argv[ptr]);
 
-	auto t1 = high_resolution_clock::now();
+	using lz77_t = h0_lz77<wt_fmi>;
 
 	lz77_t lz77;
 	ulint DEFAULT_SA_RATE = lz77_t::DEFAULT_SA_RATE;
 
-	ulint sa_rate = argc == 3 ? DEFAULT_SA_RATE : atoi(argv[1]);
+	sa_rate = not sa_rate ? DEFAULT_SA_RATE : sa_rate;
 
-	sa_rate = sa_rate == 0 ? 1 : sa_rate;
+	auto t1 = high_resolution_clock::now();
 
-	string in(argv[1+(argc==4)]);
-	string out(argv[2+(argc==4)]);
 
 	cout << "Sample rate is " << sa_rate << endl;
 
-	{
+	if(not int_file){
 
-		cout << "Detecting alphabet ... " << flush;
-		std::ifstream ifs(in);
+		{
+			cout << "Detecting alphabet ... " << flush;
+			std::ifstream ifs(in);
 
-		lz77 = lz77_t(ifs, sa_rate);
-		ifs.close();
+			lz77 = lz77_t(ifs, sa_rate);
 
-		cout << "done." << endl;
+			cout << "done." << endl;
+		}
 
-	}
+		std::ifstream ifs(in);
+		std::ofstream os(out, ios::binary);
+
+		lz77.parse(ifs,os,1,true);
 
-	std::ifstream ifs(in);
-	std::ofstream os(out);
+	}else{
 
-	lz77.parse(ifs,os,15,true);
+		lz77 = lz77_t(~uint(0), sa_rate);
+		std::ifstream ifs(in, ios::binary);
+		std::ofstream os(out, ios::binary);
 
-	ifs.close();
-	os.close();
+		lz77.parse_int(ifs,os,1,true);
+
+	}
 
 	auto t2 = high_resolution_clock::now();
 


=====================================
include/dynamic/algorithms/h0_lz77.hpp
=====================================
@@ -104,9 +104,7 @@ public:
 	 * input: an input stream and an output stream
 	 * the algorithms scans the input (just 1 scan) and
 	 * saves to the output stream (could be a file) a series
-	 * of triples <pos,len,c> of type <ulint,ulint,uchar>. Types
-	 * are converted to char* before streaming them to out
-	 * (i.e. ulint to 8 bytes and uchar to 1 byte). len is the length
+	 * of triples <pos,len,c> of type <ulint,ulint,uchar>. len is the length
 	 * of the copied string (i.e. excluded skipped characters in the end)
 	 *
 	 * after the end of a phrase, skip 'skip'>0 characters, included trailing character (LZ77
@@ -184,12 +182,9 @@ public:
 					exit(0);
 				}
 
-				auto start = (char*)(new ulint(p));
-				auto l = (char*)(new ulint(len));
-
-				out.write(start,sizeof(ulint));
-				out.write(l,sizeof(ulint));
-				out.write(&cc,1);
+				out.write((char*)&p,sizeof(ulint));
+				out.write((char*)&len,sizeof(ulint));
+				out.write((char*)&cc,sizeof(cc));
 
 				gamma_bits += gamma(uint64_t(backward_pos+1));
 				gamma_bits += gamma(uint64_t(len+1));
@@ -199,10 +194,6 @@ public:
 				delta_bits += delta(uint64_t(len+1));
 				delta_bits += delta(uint64_t(uint8_t(cc)));
 
-
-				delete start;
-				delete l;
-
 				z++;
 				len = 0;
 				p = 0;
@@ -248,6 +239,140 @@ public:
 
 	}
 
+	/*
+	 * LZ77-parse a stream of integers, reading sizeof(int) bytes per symbol
+	 * (32 bits on common platforms) as raw memory, i.e. in native byte order.
+	 *
+	 * input: an input integer stream and an output stream.
+	 * The algorithm scans the input (just 1 scan) and
+	 * saves to the output stream (could be a file) a series
+	 * of triples <pos,len,c> of type <ulint,ulint,int>, written as raw
+	 * bytes. len is the length of the copied string (i.e. excluding
+	 * skipped characters at the end); pos is written as 0 when len is 0.
+	 *
+	 * after the end of a phrase, skip 'skip'>0 characters, including the
+	 * trailing character (LZ77 sparsification, experimental)
+	 *
+	 * to get also the last factor, the input stream should
+	 * terminate with a character that does not appear elsewhere
+	 * in the stream: a phrase still open at end of input is not written.
+	 * Trailing bytes that do not fill a whole int are ignored.
+	 *
+	 * if verbose is true, progress and final counts are printed to cout.
+	 * The structure must contain only the terminator (checked by assert).
+	 *
+	 */
+	void parse_int(istream& in, ostream& out, ulint skip = 1, bool verbose = false){
+
+		assert(skip>0);
+
+		const ulint step = 100000;	//print status every step integers
+		ulint last_step = 0;		//value of n at the last status message
+
+		assert(fmi.size()==1);	//only terminator
+
+		pair<ulint, ulint> range = fmi.get_full_interval();	//BWT range of current phrase
+
+		ulint len = 0;	//length of current LZ phrase
+		ulint i = 0;	//position of terminator character in bwt
+		ulint p = 0;	//phrase occurrence
+
+		ulint z = 0; 	//number of LZ77 phrases
+
+		if(verbose) cout << "Parsing ..." << endl;
+
+		int cc;			//integer just read from the input
+		ulint n = 0;	//number of integers read so far
+		while(in.read((char*)&cc,sizeof(int))){
+
+			n++;
+
+			//progress report (only when verbose)
+			if(verbose && n>last_step+(step-1)){
+
+				last_step = n;
+				cout << " " << n << " integers processed ..." << endl;
+
+			}
+
+			//the index is queried and extended with cc converted to unsigned
+			uint c(cc);
+
+			auto new_range = fmi.LF(range,c);
+
+			if(new_range.first >= new_range.second){
+
+				//empty range: the current phrase cannot be extended with c, new factor
+
+				if(len>0){
+
+					//use range.first unless the terminator sits there; then use the last position of the range
+					ulint occ = i == range.first ? range.second-1 : range.first;
+					p = fmi.locate(occ) - len;
+
+				}
+
+				fmi.extend(c);
+
+				//backward_pos is used only by the consistency check below
+				uint64_t backward_pos = len == 0 ? 0 : (fmi.text_length() - len - 1) - p;
+
+				if(backward_pos > fmi.text_length()){
+					cout << "err" << endl;
+					exit(0);
+				}
+
+				//emit the triple <p,len,cc> as raw bytes
+				out.write((char*)&p,sizeof(ulint));
+				out.write((char*)&len,sizeof(ulint));
+				out.write((char*)&cc,sizeof(cc));
+
+				//one more phrase; reset the state for the next one
+				z++;
+				len = 0;
+				p = 0;
+
+				//skip characters: insert up to skip-1 further integers in the index without parsing them
+
+				ulint k = 0;
+
+				while(k < skip-1 && in.read((char*)&cc,sizeof(int))){
+
+					fmi.extend(uint(cc));
+					k++;
+					n++;
+
+				}
+
+				//the next phrase starts again from the full interval
+				range = fmi.get_full_interval();
+
+			}else{
+
+				//non-empty range: c extends the current phrase
+
+				len++;			//increase current phrase length
+				fmi.extend(c);	//insert character c in the BWT
+				i = fmi.get_terminator_position();				//get new terminator position
+				range = {new_range.first, new_range.second+1};	//new suffix falls inside current range: extend
+
+			}
+
+
+		}
+
+		if(verbose){
+
+			cout << "\nNumber of integers: " << n << endl;
+			cout << "Number of LZ77 phrases: " << z << endl;
+
+
+		}
+
+
+	}
+
+
 	/*
 	 * Total number of bits allocated in RAM for this structure
 	 *



View it on GitLab: https://salsa.debian.org/med-team/xxsds-dynamic/-/commit/4cdccbd6673723de83722382dfcb09b39b598312

-- 
View it on GitLab: https://salsa.debian.org/med-team/xxsds-dynamic/-/commit/4cdccbd6673723de83722382dfcb09b39b598312
You're receiving this email because of your account on salsa.debian.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20211213/685a848e/attachment-0001.htm>


More information about the debian-med-commit mailing list