[med-svn] [hinge] 02/12: New upstream version 0.5
Afif Elghraoui
afif at moszumanska.debian.org
Fri Oct 20 03:34:36 UTC 2017
This is an automated email from the git hooks/post-receive script.
afif pushed a commit to branch master
in repository hinge.
commit e288b49d26c4fac11409e30769a77c4ec2875153
Author: Afif Elghraoui <afif at debian.org>
Date: Tue Oct 17 23:01:54 2017 -0400
New upstream version 0.5
---
demo/NCTC9657_demo/run.sh | 4 ++
demo/ecoli_P4_demo/run.sh | 5 +-
demo/ecoli_demo/run.sh | 4 +-
demo/ecoli_nanopore/run.sh | 4 +-
scripts/draw_pileup_region.py | 21 ++++--
scripts/pruning_and_clipping_nanopore.py | 0
src/consensus/consensus.cpp | 22 ++++--
src/consensus/draft.cpp | 113 +++++++++++++++++++++++--------
src/filter/filter.cpp | 113 +++++++++++++++++++++++--------
src/include/LAInterface.h | 3 +
src/layout/hinging.cpp | 107 +++++++++++++++++++++++------
src/maximal/maximal.cpp | 107 ++++++++++++++++++++---------
utils/update.sh | 2 +-
13 files changed, 374 insertions(+), 131 deletions(-)
diff --git a/demo/NCTC9657_demo/run.sh b/demo/NCTC9657_demo/run.sh
index 9bdbf72..6f7dda0 100644
--- a/demo/NCTC9657_demo/run.sh
+++ b/demo/NCTC9657_demo/run.sh
@@ -14,8 +14,12 @@ mkdir log
hinge filter --db NCTC9657 --las NCTC9657 --mlas -x NCTC9657 --config ../../utils/nominal.ini
+
+hinge maximal --db NCTC9657 --las NCTC9657 --mlas -x NCTC9657 --config ../../utils/nominal.ini
+
hinge layout --db NCTC9657 --las NCTC9657.las -x NCTC9657 --config ../../utils/nominal.ini -o NCTC9657
+
hinge clip NCTC9657.edges.hinges NCTC9657.hinge.list demo
hinge draft-path $PWD NCTC9657 NCTC9657demo.G2.graphml
diff --git a/demo/ecoli_P4_demo/run.sh b/demo/ecoli_P4_demo/run.sh
index 5b9e9b3..40f0ec0 100644
--- a/demo/ecoli_P4_demo/run.sh
+++ b/demo/ecoli_P4_demo/run.sh
@@ -17,7 +17,10 @@ mkdir log
-hinge filter --db ecoli --las "ecoli.*.las" -x ecoli --config ../../utils/nominal.ini
+hinge filter --db ecoli --las ecoli --mlas -x ecoli --config ../../utils/nominal.ini
+
+hinge maximal --db ecoli --las ecoli --mlas -x ecoli --config ../../utils/nominal.ini
+
hinge layout --db ecoli --las ecoli.las -x ecoli --config ../../utils/nominal.ini -o ecoli
hinge clip ecoli.edges.hinges ecoli.hinge.list demo
diff --git a/demo/ecoli_demo/run.sh b/demo/ecoli_demo/run.sh
index 9640f26..30e9deb 100644
--- a/demo/ecoli_demo/run.sh
+++ b/demo/ecoli_demo/run.sh
@@ -22,12 +22,12 @@ hinge filter --db ecoli --las ecoli --mlas -x ecoli --config ../../utils/nominal
hinge maximal --db ecoli --las ecoli --mlas -x ecoli --config ../../utils/nominal.ini
-hinge layout --db ecoli --las ecoli.las -x ecoli --config ../../utils/nominal.ini -o ecoli
+hinge layout --db ecoli --las ecoli --mlas -x ecoli --config ../../utils/nominal.ini -o ecoli
hinge clip ecoli.edges.hinges ecoli.hinge.list demo
hinge draft-path $PWD ecoli ecolidemo.G2.graphml
-hinge draft --db ecoli --las ecoli.las --prefix ecoli --config ../../utils/nominal.ini --out ecoli.draft
+hinge draft --db ecoli --las ecoli --mlas --prefix ecoli --config ../../utils/nominal.ini --out ecoli.draft
diff --git a/demo/ecoli_nanopore/run.sh b/demo/ecoli_nanopore/run.sh
index 9953ae4..db66ea0 100644
--- a/demo/ecoli_nanopore/run.sh
+++ b/demo/ecoli_nanopore/run.sh
@@ -17,8 +17,10 @@ DASqv -c100 ecoli ecoli.las
mkdir log
-
hinge filter --db ecoli --las ecoli --mlas -x ecoli --config ../../utils/nominal.ini
+
+hinge maximal --db ecoli --las ecoli --mlas -x ecoli --config ../../utils/nominal.ini
+
hinge layout --db ecoli --las ecoli.las -x ecoli --config ../../utils/nominal.ini -o ecoli
hinge clip-nanopore ecoli.edges.hinges ecoli.hinge.list demo
diff --git a/scripts/draw_pileup_region.py b/scripts/draw_pileup_region.py
index 31dfd11..5e7d75b 100755
--- a/scripts/draw_pileup_region.py
+++ b/scripts/draw_pileup_region.py
@@ -43,13 +43,13 @@ for i,item in enumerate(util.get_alignments_mapping3(ref, read, las, contig)):
if item[3] >= left and item[4] <= right and item[4] - item[3] > length_th:
aln.append(item)
-
-
-
+
+
+
covy = np.zeros((right - left, ))
for item in aln:
covy[item[3] - left : item[4] - left] += 1
-
+
covx = np.arange(left, right)
@@ -69,7 +69,8 @@ for item in aln:
else:
aln_group.append(item)
-num = len(alns)
+#num = len(alns)
+num = len(aln)
print len(aln), len(alns)
@@ -102,7 +103,15 @@ ax1.add_line(dotted_line)
dotted_line2 = plt.Line2D((right, right), (0, num*grid_size ),ls='-.')
ax1.add_line(dotted_line2)
-for i,aln_group in enumerate(alns):
+alns_all = []
+for item in alns:
+ for aln in item:
+ alns_all.append([aln])
+
+alns_all.sort(key = lambda x:min([item[3] for item in x]))
+
+
+for i,aln_group in enumerate(alns_all):
for item in aln_group:
abpos = item[3]
aepos = item[4]
diff --git a/scripts/pruning_and_clipping_nanopore.py b/scripts/pruning_and_clipping_nanopore.py
old mode 100644
new mode 100755
diff --git a/src/consensus/consensus.cpp b/src/consensus/consensus.cpp
index 578d855..b6df26f 100644
--- a/src/consensus/consensus.cpp
+++ b/src/consensus/consensus.cpp
@@ -59,6 +59,20 @@ char toLower(char c) {
}
+int remove_multialign(std::vector<LAlignment *> idx, int idx_size, int LENGTH_THRESHOLD)
+{
+ int i, j, r=0;
+
+ for(i = 0; i < idx_size; i ++) {
+ if (idx[i]->aepos - idx[i]->abpos >= LENGTH_THRESHOLD) {
+ for(j = 0; j < r; j ++)
+ if(idx[j]->read_B_id_ == idx[i]->read_B_id_) break;
+ if(j == r)
+ idx[r++] = idx[i];
+ }
+ }
+ return r;
+}
int main(int argc, char *argv[]) {
@@ -135,13 +149,7 @@ int main(int argc, char *argv[]) {
for (int i = 0; i < n_contigs; i++) {
-
- int k = 0;
- for (k = 0; k < idx[i].size(); k++)
- if (idx[i][k]->aepos - idx[i][k]->abpos < LENGTH_THRESHOLD)
- break;
-
- int seq_count = k;
+ int seq_count = remove_multialign(idx[i], idx[i].size(),LENGTH_THRESHOLD);
std::cout << "Contig " << i << ": " << seq_count << " reads" << std::endl;
diff --git a/src/consensus/draft.cpp b/src/consensus/draft.cpp
index 702a7a7..6da5344 100644
--- a/src/consensus/draft.cpp
+++ b/src/consensus/draft.cpp
@@ -40,6 +40,34 @@ typedef adjacency_list <vecS, vecS, undirectedS> Graph;
typedef std::tuple<Node, Node, int> Edge_w;
+inline std::vector<std::string> glob(const std::string& pat){
+ using namespace std;
+ glob_t glob_result;
+ int i = 1;
+ std::string search_name;
+ search_name = pat + "."+std::to_string(i)+".las";
+ std::cout << search_name << endl;
+ glob(search_name.c_str(),GLOB_TILDE,NULL,&glob_result);
+
+ vector<string> ret;
+
+ while (glob_result.gl_pathc != 0){
+ ret.push_back(string(glob_result.gl_pathv[0]));
+ i ++;
+ search_name = pat + "."+std::to_string(i)+".las";
+ glob(search_name.c_str(),GLOB_TILDE,NULL,&glob_result);
+// std::cout << "Number of files " << glob_result.gl_pathc << std::endl;
+ }
+
+ std::cout << "-------------------------"<< std::endl;
+ std::cout << "Number of files " << i-1 << std::endl;
+ std::cout << "Input string " << pat.c_str() << std::endl;
+ std::cout << "-------------------------"<< std::endl;
+
+ globfree(&glob_result);
+ return ret;
+}
+
std::vector<int> get_mapping(std::string aln_tag1, std::string aln_tag2) {
int pos = 0;
int count = 0;
@@ -686,18 +714,6 @@ int draft_assembly_ctg(std::vector<Edge_w> & edgelist, LAInterface & la, std::ve
-inline std::vector<std::string> glob(const std::string& pat){
- using namespace std;
- glob_t glob_result;
- glob(pat.c_str(),GLOB_TILDE,NULL,&glob_result);
- vector<string> ret;
- for(unsigned int i=0;i<glob_result.gl_pathc;++i){
- ret.push_back(string(glob_result.gl_pathv[i]));
- }
- globfree(&glob_result);
- return ret;
-};
-
int main(int argc, char *argv[]) {
@@ -712,6 +728,7 @@ int main(int argc, char *argv[]) {
cmdp.add<std::string>("log", 'g', "log folder name", false, "log");
cmdp.add<std::string>("path", 0, "path file name", false, "path");
cmdp.add("debug", '\0', "debug mode");
+ cmdp.add("mlas", '\0', "multiple las files");
// cmdp.add<std::string>("restrictreads",'r',"restrict to reads in the file",false,"");
@@ -786,22 +803,24 @@ int main(int argc, char *argv[]) {
std::vector<std::string> name_las_list;
std::string name_las_str(name_las);
- if (name_las_str.find('*') != -1)
+ if (cmdp.exist("mlas")) {
name_las_list = glob(name_las_str);
+ console->info("calling glob.");
+ }
else
name_las_list.push_back(name_las_str);
- if (strlen(name_las) > 0)
- la.openAlignmentFile(name_las);
+ //if (strlen(name_las) > 0)
+ // la.openAlignmentFile(name_las);
int64 n_aln = 0;
- if (strlen(name_las) > 0) {
- n_aln = la.getAlignmentNumber();
- console->info("Load alignments from {}", name_las);
- console->info("# Alignments: {}", n_aln);
- }
+ //if (strlen(name_las) > 0) {
+ // n_aln = la.getAlignmentNumber();
+ // console->info("Load alignments from {}", name_las);
+ // console->info("# Alignments: {}", n_aln);
+ //}
int n_read;
if (strlen(name_db) > 0) {
@@ -816,8 +835,6 @@ int main(int argc, char *argv[]) {
console->info("# Reads: {}", n_read); // output some statistics
-
-
if (strlen(name_db) > 0) {
la.getRead(reads, 0, n_read);
}
@@ -841,26 +858,65 @@ int main(int argc, char *argv[]) {
reads[i]->active = maximal_read[i];
}
+ // start loading and cleaning
+ int number_of_parts;
+ number_of_parts = name_las_list.size();
std::vector<int> range;
for (int i = 0; i < n_read; i++) {
if (reads[i]->active) range.push_back(i);
}
-
std::sort(range.begin(), range.end());
std::vector<LOverlap *> aln;//Vector of pointers to all alignments
std::vector<LAlignment *> full_aln;//Vector of pointers to all alignments
-
-
- if (strlen(name_las) > 0) {
+ std::vector<LOverlap *> aln_to_remove;//Vector of pointers to all alignments
+ std::vector<LAlignment *> full_aln_to_remove;//Vector of pointers to all alignments
+
+
+ for (int part = 0; part < number_of_parts; part++) {
+ console->info("part:{}", part);
+ console->info("name of las {}", name_las_list[part]);
+ la.openAlignmentFile(name_las_list[part]);
+ int64 n_aln_part = 0;
+ n_aln_part = la.getAlignmentNumber();
+ n_aln += n_aln_part;
+ console->info("Load alignment from {}", name_las_list[part]);
+ console->info("# Alignments: {}", n_aln_part);
la.resetAlignment();
- la.getOverlap(aln, range);
+ la.getOverlap(aln_to_remove, range);
la.resetAlignment();
- la.getAlignment(full_aln, range);
+ la.getAlignment(full_aln_to_remove, range);
+
+ for (int j = 0; j < aln_to_remove.size(); j++) {
+ if ((reads[aln_to_remove[j]->read_A_id_]->active) && (reads[aln_to_remove[j]->read_B_id_]->active)) {
+ aln.push_back(aln_to_remove[j]);
+ }
+ else delete aln_to_remove[j];
+ }
+
+ for (int j = 0; j < full_aln_to_remove.size(); j++) {
+ if ((reads[full_aln_to_remove[j]->read_A_id_]->active) && (reads[full_aln_to_remove[j]->read_B_id_]->active)) {
+ full_aln.push_back(full_aln_to_remove[j]);
+ }
+ else delete full_aln_to_remove[j];
+ }
+
+ //need to do some cleaning here
+ aln_to_remove.clear();
+ full_aln_to_remove.clear();
+
+
}
+ //if (strlen(name_las) > 0) {
+ // la.resetAlignment();
+ // la.getOverlap(aln, range);
+ // la.resetAlignment();
+ // la.getAlignment(full_aln, range);
+ //}
+
if (strlen(name_paf) > 0) {
n_aln = la.loadPAF(std::string(name_paf), aln);
console->info("Load alignments from {}", name_paf);
@@ -872,7 +928,6 @@ int main(int argc, char *argv[]) {
return 1;
}
-
console->info("Input data finished");
INIReader reader(name_config);
diff --git a/src/filter/filter.cpp b/src/filter/filter.cpp
index eecc1e7..f3981a5 100644
--- a/src/filter/filter.cpp
+++ b/src/filter/filter.cpp
@@ -180,6 +180,8 @@ int main(int argc, char *argv[]) {
cmdp.add("debug", '\0', "debug mode");
cmdp.parse_check(argc, argv);
+
+
LAInterface la;
const char * name_db = cmdp.get<std::string>("db").c_str(); //.db file of reads to load
const char * name_las_base = cmdp.get<std::string>("las").c_str();//.las file of alignments
@@ -190,10 +192,46 @@ int main(int argc, char *argv[]) {
bool has_qv = true;
const char * name_restrict = cmdp.get<std::string>("restrictreads").c_str();
+ namespace spd = spdlog;
+
+ //auto console = spd::stdout_logger_mt("console",true);
+
+ std::vector<spdlog::sink_ptr> sinks;
+ sinks.push_back(std::make_shared<spdlog::sinks::stdout_sink_st>());
+ sinks.push_back(std::make_shared<spdlog::sinks::daily_file_sink_st>(cmdp.get<std::string>("log") + "/log", "txt", 23, 59));
+ auto console = std::make_shared<spdlog::logger>("log", begin(sinks), end(sinks));
+ spdlog::register_logger(console);
+ //auto console = std::make_shared<spdlog::logger>("name", begin(sinks), end(sinks));
+
+
+ console->info("Reads filtering");
+
+
+ bool db_and_las, db_or_las, fa_and_paf, fa_or_paf;
+ db_and_las = (strlen(name_db) > 0) and (strlen(name_las_base) > 0);
+ db_or_las = (strlen(name_db) > 0) or (strlen(name_las_base) > 0);
+ fa_and_paf = (strlen(name_fasta) > 0) and (strlen(name_paf) > 0);
+ fa_or_paf = (strlen(name_fasta) > 0) or (strlen(name_paf) > 0);
+
+ if (db_or_las and fa_or_paf){
+ console->error("Pass in either a db and a las or a fasta and a paf");
+ return 1;
+ }
+
+ if (( not fa_and_paf) and (not db_and_las)){
+ console->error("Pass in at least one of the following two combinations: a db and a las or a fasta and a paf");
+ return 1;
+ }
+
std::string name_las_string;
- if (cmdp.exist("mlas"))
- name_las_string = std::string(name_las_base);
- else {
+ if (cmdp.exist("mlas")) {
+ if (not db_and_las){
+ console->error("--mlas works only with db and las");
+ return 1;
+ }
+ name_las_string = std::string(name_las_base);
+ }
+ else if (strlen(name_las_base) > 0) {
if (lastN(std::string(name_las_base), 4) == ".las")
name_las_string = std::string(name_las_base);
else
@@ -208,25 +246,15 @@ int main(int argc, char *argv[]) {
* the other is fasta + paf, which corresponds to minimap as an overlapper.
*/
- namespace spd = spdlog;
- //auto console = spd::stdout_logger_mt("console",true);
-
- std::vector<spdlog::sink_ptr> sinks;
- sinks.push_back(std::make_shared<spdlog::sinks::stdout_sink_st>());
- sinks.push_back(std::make_shared<spdlog::sinks::daily_file_sink_st>(cmdp.get<std::string>("log") + "/log", "txt", 23, 59));
- auto console = std::make_shared<spdlog::logger>("log", begin(sinks), end(sinks));
- spdlog::register_logger(console);
- //auto console = std::make_shared<spdlog::logger>("name", begin(sinks), end(sinks));
-
-
- console->info("Reads filtering");
+// std::cout << "here now " << std::endl;
console->info("name of db: {}, name of .las file {}", name_db, name_las);
console->info("name of fasta: {}, name of .paf file {}", name_fasta, name_paf);
+
std::ifstream ini_file(name_config);
std::string str((std::istreambuf_iterator<char>(ini_file)),
std::istreambuf_iterator<char>());
@@ -426,14 +454,31 @@ int main(int argc, char *argv[]) {
std::ofstream covflag(out + ".cov.flag");
std::ofstream selfflag(out + ".self.flag");
- for (int part = 0; part < name_las_list.size(); part++) {
+// std::cout << "LAS list length "<< name_las_list.size() << std::endl;
+
+// for (int ind = 0; ind < name_las_list.size() ; ind ++)
+// std::cout << "name of las: "<< name_las_list[ind] << std::endl;
+ int number_of_parts;
+ if (strlen(name_las) > 0)
+ number_of_parts = name_las_list.size();
+ else if(strlen(name_paf) > 0)
+ number_of_parts = 1;
+ else {
+ console->error("Need to provide either las and db or paf and fasta");
+ return 1;
+ }
- console->info("name of las: {}", name_las_list[part]);
+ for (int part = 0; part < number_of_parts; part++)
+ {
+ console->info("part: {}", part);
- if (strlen(name_las_list[part].c_str()) > 0)
- la.openAlignmentFile(name_las_list[part]);
+ if (strlen(name_las) > 0) {
+ console->info("name of las: {}", name_las_list[part]);
+ if (strlen(name_las_list[part].c_str()) > 0)
+ la.openAlignmentFile(name_las_list[part]);
+ }
int64 n_aln = 0;
@@ -460,7 +505,7 @@ int main(int argc, char *argv[]) {
return 1;
}
- console->info("Input data finished, part {}/{}", part + 1, name_las_list.size());
+ console->info("Input data finished, part {}/{}", part + 1, number_of_parts);
console->info("length of alignments {}", aln.size());
@@ -500,13 +545,15 @@ int main(int argc, char *argv[]) {
}
}
+
+
std::set<int> self_match_reads;
for (auto it : self_aln_list) {
float cov = 0.0;
for (int i = 0; i < it.second.size(); i++)
cov += it.second[i].second - it.second[i].first;
cov /= float(reads[it.first]->len);
- std::cout << "selfcov: " << it.first << " " << cov << " " << reads[it.first]->len << std::endl;
+// std::cout << "selfcov: " << it.first << " " << cov << " " << reads[it.first]->len << std::endl;
if ((cov > 4.5) and (reads[it.first]->len > 10000))
self_match_reads.insert(it.first);
}
@@ -564,16 +611,21 @@ int main(int argc, char *argv[]) {
cgs[i] = (cg);
}
- console->info("profile coverage done part {}/{}", part + 1, name_las_list.size());
+ console->info("profile coverage done part {}/{}", part + 1, number_of_parts);
std::set<int> rand_reads;
srand(time(NULL));
rand_reads.insert(0);
+ int temp_index(0);
while (rand_reads.size() < (r_end - r_begin)/500){
+ temp_index ++;
int rd_id=rand()%(r_end - r_begin) + r_begin;
if (reads[rd_id]->len > 5000)
rand_reads.insert(rd_id);
+
+ if (temp_index > 20000)
+ break;
}
int num_slot = 0;
@@ -775,7 +827,6 @@ int main(int argc, char *argv[]) {
}
-
// need a better hinge detection
// get hinges from repeat annotation information
@@ -1016,6 +1067,8 @@ int main(int argc, char *argv[]) {
}
}
+ console->info("reached end of loop");
+
//output hinges
int ra_cnt = 0;
@@ -1042,22 +1095,22 @@ int main(int argc, char *argv[]) {
hg << std::endl;
}
-
console->info("Number of hinges: {}", hg_cnt);
-
- for (int i = 0; i < aln.size(); i++) {
- free(aln[i]);
+ if (strlen(name_las) > 0) {
+ for (int i = 0; i < aln.size(); i++) {
+ delete aln[i];
+ }
+ aln.clear();
}
- aln.clear();
+ console->info("part: {}", part);
+// console->info("going through: {}", part+1 < name_las_list.size());
}
-
hg.close();
-
if (strlen(name_db)>0)
la.closeDB(); //close database
return 0;
diff --git a/src/include/LAInterface.h b/src/include/LAInterface.h
index 51b6884..4f1e845 100644
--- a/src/include/LAInterface.h
+++ b/src/include/LAInterface.h
@@ -76,6 +76,9 @@ public:
class LOverlap { // LOverlap is a simplified version of LAlignment, no trace
public:
LOverlap() { };
+
+ ~LOverlap() {free(trace_pts); };
+
void show() {printf("%d %d %d [%d...%d]/%d x [%d...%d]/%d %d diffs, %d type\n", read_A_id_, read_B_id_,
reverse_complement_match_,
read_A_match_start_, read_A_match_end_, alen, read_B_match_start_, read_B_match_end_, blen, diffs,
diff --git a/src/layout/hinging.cpp b/src/layout/hinging.cpp
index a85d18b..f4ab31a 100644
--- a/src/layout/hinging.cpp
+++ b/src/layout/hinging.cpp
@@ -344,7 +344,8 @@ void PrintOverlapToFile2(FILE * file_pointer, LOverlap * match, int hinge_pos) {
void GetAlignment ( LAInterface &la, std::vector<Read *> & reads, std::vector<std::unordered_map<int, std::vector<LOverlap *> > > & idx_ab,
std::vector<std::vector<LOverlap *>> & matches_forward, std::vector<std::vector<LOverlap *>>& matches_backward,
- int n_read, const char *name_db, const char *name_las_base, bool mult_las,
+ int n_read, const char *name_db, const char *name_las_base,
+ const char *name_paf, bool mult_las,
int ALN_THRESHOLD, int THETA, int THETA2, bool USE_TWO_MATCHES, int64 n_aln_full,
const std::shared_ptr<spdlog::logger> console,
std::string name_maximal_reads, bool KEEP_ONLY_MATCHES_BETWEEN_MAXIMAL_READS ){
@@ -357,14 +358,18 @@ void GetAlignment ( LAInterface &la, std::vector<Read *> & reads, std::vector<st
int64 n_rev_aln_kept_full(0);
std::string name_las_string;
console->info("Multiple las files: {}", mult_las);
+ if (strlen(name_paf) > 0)
+ console->info("Loading from paf: {}", name_paf);
- if (mult_las)
- name_las_string = std::string(name_las_base);
- else {
- if (lastN(std::string(name_las_base), 4) == ".las")
+ if (strlen(name_las_base) > 0) {
+ if (mult_las)
name_las_string = std::string(name_las_base);
- else
- name_las_string = std::string(name_las_base) + ".las";
+ else {
+ if (lastN(std::string(name_las_base), 4) == ".las")
+ name_las_string = std::string(name_las_base);
+ else
+ name_las_string = std::string(name_las_base) + ".las";
+ }
}
n_aln_full = 0;
@@ -374,12 +379,17 @@ void GetAlignment ( LAInterface &la, std::vector<Read *> & reads, std::vector<st
std::string name_las_str(name_las);
console->info("Las files: {}", name_las_str);
- if (mult_las) {
+ if (mult_las and strlen(name_las_base) > 0) {
console->info("Calling glob.");
name_las_list = glob(name_las_str);
}
- else
+ else if (strlen(name_las_base) > 0)
name_las_list.push_back(name_las_str);
+ else{
+ name_las_str = std::string(name_paf);
+ name_las_list.push_back(name_las_str);
+ }
+
console->info("number of las files: {}", name_las_list.size());
@@ -399,31 +409,50 @@ void GetAlignment ( LAInterface &la, std::vector<Read *> & reads, std::vector<st
reads[i]->active = (reads[i]->active) and (maximal_read[i]);
}
+ int number_of_parts;
+ if (strlen(name_las) > 0)
+ number_of_parts = name_las_list.size();
+ else if(strlen(name_paf) > 0)
+ number_of_parts = 1;
+ else {
+ console->error("Need to provide either las and db or paf and fasta");
+ }
- for (int part = 0; part < name_las_list.size(); part++) {
+ for (int part = 0; part < number_of_parts; part++) {
- console->info("name of las: {}", name_las_list[part]);
+ if (strlen(name_las_base) > 0) {
+ console->info("name of las: {}", name_las_list[part]);
- if (strlen(name_las_list[part].c_str()) > 0)
- la.openAlignmentFile(name_las_list[part]);
+ if (strlen(name_las_list[part].c_str()) > 0)
+ la.openAlignmentFile(name_las_list[part]);
+ }
int64 n_aln = 0;
int64 n_aln_accept = 0;
int64 n_aln_rcomp_accept = 0;
+ std::vector<LOverlap *> aln;//Vector of pointers to all alignments
+
+ if (strlen(name_las_base) > 0) {
+ if (strlen(name_las_list[part].c_str()) > 0) {
+ n_aln = la.getAlignmentNumber();
+ console->info("Load alignments from {}", name_las_list[part]);
+ console->info("# Alignments: {}", n_aln);
+ }
+ if (strlen(name_las_list[part].c_str()) > 0) {
+ la.resetAlignment();
+ la.getOverlap(aln, 0, n_read);
+ }
+ }
- if (strlen(name_las_list[part].c_str()) > 0) {
- n_aln = la.getAlignmentNumber();
- console->info("Load alignments from {}", name_las_list[part]);
+ if (strlen(name_paf) > 0){
+ n_aln = la.loadPAF(std::string(name_paf), aln);
+ console->info("Load alignments from {}", name_paf);
console->info("# Alignments: {}", n_aln);
}
- std::vector<LOverlap *> aln;//Vector of pointers to all alignments
- if (strlen(name_las_list[part].c_str()) > 0) {
- la.resetAlignment();
- la.getOverlap(aln, 0, n_read);
- }
+
int r_begin = aln.front()->read_A_id_;
int r_end = aln.back()->read_A_id_;
@@ -466,6 +495,14 @@ void GetAlignment ( LAInterface &la, std::vector<Read *> & reads, std::vector<st
n_rev_overlaps += aln[i]->reverse_complement_match_;
}
+
+ for (int i = 0; i < aln.size(); i++) {
+ if ( not ((reads[aln[i]->read_A_id_]->active) and
+ ((reads[aln[i]->read_B_id_]->active) and KEEP_ONLY_MATCHES_BETWEEN_MAXIMAL_READS)))
+ if (strlen(name_las_base) > 0)
+ delete aln[i];
+ }
+
console->info("kept {}/{} overlaps, {}/{} rev_overlaps in part {}/{}",n_aln_accept,
n_overlaps, n_aln_rcomp_accept,
n_rev_overlaps,
@@ -646,6 +683,7 @@ int main(int argc, char *argv[]) {
console->info("Hinging layout");
+
bool mult_las;
mult_las = cmdp.exist("mlas");
console->info("name of db: {}, name of .las file {}", name_db, name_las);
@@ -655,6 +693,30 @@ int main(int argc, char *argv[]) {
console->info("Multiple las files: {}", mult_las);
console->info("Multiple las files: {}", cmdp.exist("mlas"));
+ bool db_and_las, db_or_las, fa_and_paf, fa_or_paf;
+ db_and_las = (strlen(name_db) > 0) and (strlen(name_las) > 0);
+ db_or_las = (strlen(name_db) > 0) or (strlen(name_las) > 0);
+ fa_and_paf = (strlen(name_fasta) > 0) and (strlen(name_paf) > 0);
+ fa_or_paf = (strlen(name_fasta) > 0) or (strlen(name_paf) > 0);
+
+ if (db_or_las and fa_or_paf){
+ console->error("Pass in either a db and a las or a fasta and a paf");
+ return 1;
+ }
+
+ if (( not fa_and_paf) and (not db_and_las)){
+ console->error("Pass in at least one of the following two combinations: a db and a las or a fasta and a paf");
+ return 1;
+ }
+
+ if (cmdp.exist("mlas")) {
+ if (not db_and_las) {
+ console->error("--mlas works only with db and las");
+ return 1;
+ }
+ }
+
+
std::ifstream ini_file(name_config);
std::string str((std::istreambuf_iterator<char>(ini_file)),
std::istreambuf_iterator<char>());
@@ -919,7 +981,8 @@ int main(int argc, char *argv[]) {
GetAlignment ( la, reads, idx_ab, matches_forward, matches_backward,
- n_read, name_db, name_las, mult_las, ALN_THRESHOLD, THETA, THETA2,
+ n_read, name_db, name_las, name_paf,
+ mult_las, ALN_THRESHOLD, THETA, THETA2,
USE_TWO_MATCHES, n_aln, console, name_max,
KEEP_ONLY_MATCHES_BETWEEN_MAXIMAL_READS);
diff --git a/src/maximal/maximal.cpp b/src/maximal/maximal.cpp
index 7850fc5..38a0f53 100644
--- a/src/maximal/maximal.cpp
+++ b/src/maximal/maximal.cpp
@@ -261,10 +261,45 @@ int main(int argc, char *argv[]) {
const char * name_restrict = cmdp.get<std::string>("restrictreads").c_str();
std::string name_mask = out + ".mas";
+ namespace spd = spdlog;
+
+ //auto console = spd::stdout_logger_mt("console",true);
+
+ std::vector<spdlog::sink_ptr> sinks;
+ sinks.push_back(std::make_shared<spdlog::sinks::stdout_sink_st>());
+ sinks.push_back(std::make_shared<spdlog::sinks::daily_file_sink_st>(cmdp.get<std::string>("log") + "/log", "txt", 23, 59));
+ auto console = std::make_shared<spdlog::logger>("log", begin(sinks), end(sinks));
+ spdlog::register_logger(console);
+ //auto console = std::make_shared<spdlog::logger>("name", begin(sinks), end(sinks));
+
+
+ console->info("Getting maximal reads");
+
+ bool db_and_las, db_or_las, fa_and_paf, fa_or_paf;
+ db_and_las = (strlen(name_db) > 0) and (strlen(name_las_base) > 0);
+ db_or_las = (strlen(name_db) > 0) or (strlen(name_las_base) > 0);
+ fa_and_paf = (strlen(name_fasta) > 0) and (strlen(name_paf) > 0);
+ fa_or_paf = (strlen(name_fasta) > 0) or (strlen(name_paf) > 0);
+
+ if (db_or_las and fa_or_paf){
+ console->error("Pass in either a db and a las or a fasta and a paf");
+ return 1;
+ }
+
+ if (( not fa_and_paf) and (not db_and_las)){
+ console->error("Pass in at least one of the following two combinations: a db and a las or a fasta and a paf");
+ return 1;
+ }
+
std::string name_las_string;
- if (cmdp.exist("mlas"))
- name_las_string = std::string(name_las_base);
- else {
+ if (cmdp.exist("mlas")) {
+ if (not db_and_las){
+ console->error("--mlas works only with db and las");
+ return 1;
+ }
+ name_las_string = std::string(name_las_base);
+ }
+ else if (strlen(name_las_base) > 0) {
if (lastN(std::string(name_las_base), 4) == ".las")
name_las_string = std::string(name_las_base);
else
@@ -278,19 +313,7 @@ int main(int argc, char *argv[]) {
* the other is fasta + paf, which corresponds to minimap as an overlapper.
*/
- namespace spd = spdlog;
- //auto console = spd::stdout_logger_mt("console",true);
-
- std::vector<spdlog::sink_ptr> sinks;
- sinks.push_back(std::make_shared<spdlog::sinks::stdout_sink_st>());
- sinks.push_back(std::make_shared<spdlog::sinks::daily_file_sink_st>(cmdp.get<std::string>("log") + "/log", "txt", 23, 59));
- auto console = std::make_shared<spdlog::logger>("log", begin(sinks), end(sinks));
- spdlog::register_logger(console);
- //auto console = std::make_shared<spdlog::logger>("name", begin(sinks), end(sinks));
-
-
- console->info("Getting maximal reads");
console->info("name of db: {}, name of .las file {}", name_db, name_las);
@@ -522,7 +545,17 @@ int main(int argc, char *argv[]) {
}
console->info("active reads after correcting for read lengths: {}", num_active_read);
- console->info("number of las files: {}", name_las_list.size());
+ int number_of_parts;
+ if (strlen(name_las) > 0)
+ number_of_parts = name_las_list.size();
+ else if(strlen(name_paf) > 0)
+ number_of_parts = 1;
+ else {
+ console->error("Need to provide either las and db or paf and fasta");
+ return 1;
+ }
+
+ console->info("number of las files: {}", number_of_parts);
for (int part = 0; part < name_las_list.size(); part++) {
@@ -530,22 +563,27 @@ int main(int argc, char *argv[]) {
console->info("name of las: {}", name_las_list[part]);
- if (strlen(name_las_list[part].c_str()) > 0)
- la.openAlignmentFile(name_las_list[part]);
+
int64 n_aln = 0;
- if (strlen(name_las_list[part].c_str()) > 0) {
- n_aln = la.getAlignmentNumber();
- console->info("Load alignments from {}", name_las_list[part]);
- console->info("# Alignments: {}", n_aln);
+ if(strlen(name_las_base)> 0) {
+
+ if (strlen(name_las_list[part].c_str()) > 0)
+ la.openAlignmentFile(name_las_list[part]);
+ if (strlen(name_las_list[part].c_str()) > 0) {
+ n_aln = la.getAlignmentNumber();
+ console->info("Load alignments from {}", name_las_list[part]);
+ console->info("# Alignments: {}", n_aln);
+ }
+ if (strlen(name_las_list[part].c_str()) > 0) {
+ la.resetAlignment();
+ la.getOverlap(aln, 0, n_read);
+ }
}
- if (strlen(name_las_list[part].c_str()) > 0) {
- la.resetAlignment();
- la.getOverlap(aln, 0, n_read);
- }
+
if (strlen(name_paf) > 0) {
n_aln = la.loadPAF(std::string(name_paf), aln);
@@ -558,7 +596,7 @@ int main(int argc, char *argv[]) {
return 1;
}
- console->info("Input data finished, part {}/{}", part + 1, name_las_list.size());
+ console->info("Input data finished, part {}/{}", part + 1, number_of_parts);
@@ -644,16 +682,20 @@ int main(int argc, char *argv[]) {
cgs[i] = (cg);
}
- console->info("profile coverage done part {}/{}", part + 1, name_las_list.size());
+ console->info("profile coverage done part {}/{}", part + 1, number_of_parts);
std::set<int> rand_reads;
srand(time(NULL));
rand_reads.insert(0);
+ int temp_index(0);
while (rand_reads.size() < (r_end - r_begin)/500){
+ temp_index ++;
int rd_id=rand()%(r_end - r_begin) + r_begin;
if (reads[rd_id]->len > 5000)
rand_reads.insert(rd_id);
+ if (temp_index > 20000)
+ break;
}
int num_slot = 0;
@@ -841,11 +883,12 @@ int main(int argc, char *argv[]) {
console->info("active reads: {}", num_active_read);
console->info("total reads: {}", r_end-r_begin+1);
-
- for (int i = 0; i < aln.size(); i++) {
- free(aln[i]);
+ if (strlen(name_las) > 0) {
+ for (int i = 0; i < aln.size(); i++) {
+ delete aln[i];
+ }
+ aln.clear();
}
- aln.clear();
}
diff --git a/utils/update.sh b/utils/update.sh
index 1fa5e68..a68a399 100755
--- a/utils/update.sh
+++ b/utils/update.sh
@@ -17,7 +17,7 @@ then rsync -rizP --delete --exclude '.*' --exclude 'build' $1 at shannon.stanford.e
fi
if [ "$2" == "update" ];
-then ssh -t $1 at shannon.stanford.edu "cd /home/$1/AwesomeAssembler && ./utils/build.sh"
+then ssh -t $1 at shannon.stanford.edu "export TEMP=/home/$1/tmp && cd /home/$1/AwesomeAssembler && ./utils/build.sh"
fi
if [ "$2" == "all" ];
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/hinge.git
More information about the debian-med-commit
mailing list