[med-svn] [Git][med-team/malt][upstream] New upstream version 0.0+git20180707.42b5cfd
Andreas Tille
gitlab at salsa.debian.org
Sat Aug 4 06:40:33 BST 2018
Andreas Tille pushed to branch upstream at Debian Med / malt
Commits:
d3e6b2a1 by Andreas Tille at 2018-08-04T03:59:52Z
New upstream version 0.0+git20180707.42b5cfd
- - - - -
15 changed files:
- src/malt/AlignmentEngine.java
- src/malt/MaltBuild.java
- src/malt/MaltRun.java
- src/malt/Version.java
- src/malt/data/ReferencesDBBuilder.java
- src/malt/genes/GeneItem.java
- src/malt/genes/GeneTableAccess.java → src/malt/genes/GeneItemAccessor.java
- + src/malt/genes/GeneItemCreator.java
- − src/malt/genes/GeneTableBuilder.java
- − src/malt/tools/AAddBuild.java
- − src/malt/tools/AAddRun.java
- tex/manual/manual.pdf
- tex/manual/versioninfo.tex
- tex/manual/versioninfo_new.tex
- tex/manual/versioninfo_old.tex
Changes:
=====================================
src/malt/AlignmentEngine.java
=====================================
--- a/src/malt/AlignmentEngine.java
+++ b/src/malt/AlignmentEngine.java
@@ -23,7 +23,7 @@ import jloda.util.Basic;
import malt.align.AlignerOptions;
import malt.align.BandedAligner;
import malt.data.*;
-import malt.genes.GeneTableAccess;
+import malt.genes.GeneItemAccessor;
import malt.io.*;
import malt.util.FixedSizePriorityQueue;
import malt.util.Utilities;
@@ -59,7 +59,7 @@ public class AlignmentEngine {
private final FileWriterRanked unalignedReadsWriter;
private final RMA6Writer rmaWriter;
- private final GeneTableAccess geneTableAccess;
+ private final GeneItemAccessor geneTableAccess;
// parameters
private final double minRawScore;
@@ -100,7 +100,7 @@ public class AlignmentEngine {
AlignmentEngine(final int threadNumber, final MaltOptions maltOptions, AlignerOptions alignerOptions, final ReferencesDBAccess referencesDB,
final ReferencesHashTableAccess[] tables, final FastAReader fastAReader,
final FileWriterRanked matchesWriter, final RMA6Writer rmaWriter,
- final FileWriterRanked alignedReadsWriter, final FileWriterRanked unalignedReadsWriter, final GeneTableAccess geneTableAccess) throws IOException {
+ final FileWriterRanked alignedReadsWriter, final FileWriterRanked unalignedReadsWriter, final GeneItemAccessor geneTableAccess) throws IOException {
this.threadNumber = threadNumber;
this.maltOptions = maltOptions;
this.referencesDB = referencesDB;
@@ -356,7 +356,7 @@ public class AlignmentEngine {
}
int end = aligner.getEndReference();
referenceHeader = geneTableAccess.annotateRefString(Basic.toString(referencesDB.getHeader(refIndex)), refIndex, start, end).getBytes();
- // System.err.println(Basic.toString(referenceHeader));
+ //System.err.println(Basic.toString(referenceHeader));
}
byte[] text = null;
=====================================
src/malt/MaltBuild.java
=====================================
--- a/src/malt/MaltBuild.java
+++ b/src/malt/MaltBuild.java
@@ -21,13 +21,16 @@ package malt;
import jloda.util.*;
import malt.data.*;
-import malt.genes.GeneTableBuilder;
+import malt.genes.GeneItem;
+import malt.genes.GeneItemCreator;
import malt.mapping.Mapping;
import malt.util.Utilities;
import megan.classification.Classification;
import megan.classification.ClassificationManager;
import megan.classification.IdMapper;
import megan.classification.IdParser;
+import megan.tools.AAdderBuild;
+import megan.util.interval.Interval;
import java.io.File;
import java.io.IOException;
@@ -38,6 +41,8 @@ import java.util.*;
* Daniel Huson, 8.2014
*/
public class MaltBuild {
+ final public static String INDEX_CREATOR = "MALT";
+
/**
* run the program
*
@@ -110,24 +115,30 @@ public class MaltBuild {
proteinReduction = "";
options.comment("Classification:");
- final boolean parseTaxonNames = true;
+ final boolean parseTaxonNames = options.getOption("-tn", "parseTaxonNames", "Parse taxon names", true);
+ final String gi2TaxaFile = options.getOption("-g2t", "gi2taxa", "GI-to-Taxonomy mapping file", "");
+ final String acc2TaxaFile = options.getOption("-a2t", "acc2taxa", "Accession-to-Taxonomy mapping file", "");
+ final String synonyms2TaxaFile = options.getOption("-s2t", "syn2taxa", "Synonyms-to-Taxonomy mapping file", "");
- final Map<String, String> cName2GIFileName = new HashMap<>();
- final Map<String, String> cName2AcessionFileName = new HashMap<>();
- final Map<String, String> cName2SynonymsFileName = new HashMap<>();
+ final HashMap<String, String> class2GIFile = new HashMap<>();
+ final HashMap<String, String> class2AccessionFile = new HashMap<>();
+ final HashMap<String, String> class2SynonymsFile = new HashMap<>();
final Set<String> classificationsToUse = new TreeSet<>();
- for (String cName : ClassificationManager.getAllSupportedClassifications()) {
- cName2GIFileName.put(cName, options.getOption("-g2" + cName.toLowerCase(), "gi2" + cName.toLowerCase(), "GI-to-" + cName + " mapping file (deprecated)", ""));
- cName2AcessionFileName.put(cName, options.getOption("-a2" + cName.toLowerCase(), "acc2" + cName.toLowerCase(), "Accession-to-" + cName + " mapping file", ""));
- cName2SynonymsFileName.put(cName, options.getOption("-s2" + cName.toLowerCase(), "syn2" + cName.toLowerCase(), "Synonyms-to-" + cName + " mapping file", ""));
-
- if (cName2GIFileName.get(cName).length() > 0 || cName2AcessionFileName.get(cName).length() > 0 || cName2SynonymsFileName.get(cName).length() > 0)
+ for (String cName : ClassificationManager.getAllSupportedClassificationsExcludingNCBITaxonomy()) {
+ class2GIFile.put(cName, options.getOption("-g2" + cName.toLowerCase(), "gi2" + cName.toLowerCase(), "GI-to-" + cName + " mapping file", ""));
+ class2AccessionFile.put(cName, options.getOption("-a2" + cName.toLowerCase(), "acc2" + cName.toLowerCase(), "Accession-to-" + cName + " mapping file", ""));
+ class2SynonymsFile.put(cName, options.getOption("-s2" + cName.toLowerCase(), "syn2" + cName.toLowerCase(), "Synonyms-to-" + cName + " mapping file", ""));
+ final String tags = options.getOption("-t4" + cName.toLowerCase(), "tags4" + cName.toLowerCase(), "Tags for " + cName + " id parsing (must set to activate id parsing)", "").trim();
+ if (tags.length() > 0)
+ ProgramProperties.put(cName + "Tags", tags);
+ ProgramProperties.put(cName + "ParseIds", tags.length() > 0);
+ // final boolean useLCA = options.getOption("-l_" + cName.toLowerCase(), "lca" + cName.toLowerCase(), "Use LCA for assigning to '" + cName + "', alternative: best hit", ProgramProperties.get(cName + "UseLCA", cName.equals(Classification.Taxonomy)));
+ // ProgramProperties.put(cName + "UseLCA", useLCA);
+
+ if (class2GIFile.get(cName).length() > 0 || class2AccessionFile.get(cName).length() > 0 || class2AccessionFile.get(cName).length() > 0)
classificationsToUse.add(cName);
-
- if (cName.equalsIgnoreCase(Classification.Taxonomy))
- options.getOption("-tn", "parseTaxonNames", "Parse taxon names", true);
}
final boolean functionalClassification = !options.getOption("-nf", "noFun", "Turn off functional classifications for provided mapping files (set this when using GFF files for DNA references)", false);
@@ -139,10 +150,11 @@ public class MaltBuild {
final boolean saveFirstWordOfReferenceHeaderOnly = options.getOption("-fwo", "firstWordOnly", "Save only first word of reference header", false);
final int randomSeed = options.getOption("rns", "random", "Random number generator seed", 666);
final float hashTableLoadFactor = options.getOption("hsf", "hashScaleFactor", "Hash table scale factor", 0.9f, 0.1f, 1.0f);
- //final boolean buildTableInMemory = options.getOption("btm", "buildTableInMemory", "Build the hash table in memory and then save (more memory, much faster)", true);
- final boolean buildTableInMemory = true; // don't make this an option because it is really slow...
+ final boolean buildTableInMemory = options.getOption("btm", "buildTableInMemory", "Build the hash table in memory and then save (uses more memory, is much faster)", true);
final boolean doBuildTables = !options.getOption("!xX", "xSkipTable", "Don't recompute index and tables, just compute profile support", false);
+ final boolean lookInside = options.getOption("-ex", "extraStrict", "When given an input directory, look inside every GFF file to check that it is indeed in GFF3 format", false);
+
options.done();
Basic.setDebugMode(options.isVerbose());
@@ -164,20 +176,7 @@ public class MaltBuild {
}
}
- if (gffFiles.size() == 1) {
- final File file = new File(gffFiles.get(0));
- if (file.isDirectory()) {
- System.err.println("Looking for GFF files in directory: " + file);
- gffFiles.clear();
- for (File aFile : Basic.getAllFilesInDirectory(file, new GFF3FileFilter(), true)) {
- gffFiles.add(aFile.getPath());
- }
- if (gffFiles.size() == 0)
- throw new IOException("No GFF files found in directory: " + file);
- else
- System.err.println(String.format("Found: %,d", gffFiles.size()));
- }
- }
+ AAdderBuild.setupGFFFiles(gffFiles, lookInside);
System.err.println("Reference sequence type set to: " + sequenceType.toString());
final IAlphabet referenceAlphabet;
@@ -240,7 +239,9 @@ public class MaltBuild {
}
}
- // setup classification support
+ if (gi2TaxaFile.length() > 0 || acc2TaxaFile.length() > 0 || synonyms2TaxaFile.length() > 0)
+ classificationsToUse.add(Classification.Taxonomy);
+
for (String cName : classificationsToUse) {
final String cNameLowerCase = cName.toLowerCase();
final String sourceName = (cName.equals(Classification.Taxonomy) ? "ncbi" : cNameLowerCase);
@@ -250,12 +251,12 @@ public class MaltBuild {
Basic.writeStreamToFile(ResourceManager.getFileAsStream(sourceName + ".tre"), new File(indexDirectory, cNameLowerCase + ".tre"));
Basic.writeStreamToFile(ResourceManager.getFileAsStream(sourceName + ".map"), new File(indexDirectory, cNameLowerCase + ".map"));
- if (cName2SynonymsFileName.get(cName).length() > 0)
- Utilities.loadMapping(cName2SynonymsFileName.get(cName), IdMapper.MapType.Synonyms, cName);
- if (cName2AcessionFileName.get(cName).length() > 0)
- Utilities.loadMapping(cName2AcessionFileName.get(cName), IdMapper.MapType.Accession, cName);
- if (cName2GIFileName.get(cName).length() > 0)
- Utilities.loadMapping(cName2GIFileName.get(cName), IdMapper.MapType.GI, cName);
+ if (class2SynonymsFile.get(cName) != null)
+ Utilities.loadMapping(class2SynonymsFile.get(cName), IdMapper.MapType.Synonyms, cName);
+ if (class2AccessionFile.get(cName) != null)
+ Utilities.loadMapping(class2AccessionFile.get(cName), IdMapper.MapType.Accession, cName);
+ if (class2GIFile.get(cName) != null)
+ Utilities.loadMapping(class2GIFile.get(cName), IdMapper.MapType.GI, cName);
final IdParser idParser = ClassificationManager.get(cName, true).getIdMapper().createIdParser();
if (cName.equals(Classification.Taxonomy))
@@ -271,8 +272,13 @@ public class MaltBuild {
referencesDB.save(new File(indexDirectory, "ref.idx"), new File(indexDirectory, "ref.db"), new File(indexDirectory, "ref.inf"), saveFirstWordOfReferenceHeaderOnly);
if (gffFiles.size() > 0) {
- GeneTableBuilder geneTableBuilder = new GeneTableBuilder();
- geneTableBuilder.buildAndSaveAnnotations(referencesDB, gffFiles, new File(indexDirectory, "annotation.idx"), new File(indexDirectory, "annotation.db"), numberOfThreads);
+ // setup gene item creator, in particular accession mapping
+ final GeneItemCreator creator = AAdderBuild.setupCreator(null, class2AccessionFile);
+
+ // obtains the gene annotations:
+ Map<String, ArrayList<Interval<GeneItem>>> dnaId2list = AAdderBuild.computeAnnotations(creator, gffFiles);
+
+ AAdderBuild.saveIndex(INDEX_CREATOR, creator, indexDirectory.getPath(), dnaId2list, referencesDB.refNames());
}
}
}
=====================================
src/malt/MaltRun.java
=====================================
--- a/src/malt/MaltRun.java
+++ b/src/malt/MaltRun.java
@@ -25,7 +25,7 @@ import malt.align.BlastStatisticsHelper;
import malt.align.DNAScoringMatrix;
import malt.align.ProteinScoringMatrix;
import malt.data.*;
-import malt.genes.GeneTableAccess;
+import malt.genes.GeneItemAccessor;
import malt.io.*;
import malt.mapping.MappingManager;
import malt.util.Utilities;
@@ -196,7 +196,7 @@ public class MaltRun {
ReadMagnitudeParser.setEnabled(options.getOption("mag", "magnitudes", "Reads have magnitudes (to be used in taxonomic or functional analysis)", false));
- maltOptions.setContaminantsFile(ProgramProperties.getIfEnabled("enable-contaminants", options.getOption("-cf", "conFile", "File of contaminant taxa (one Id or name per line)", "")));
+ maltOptions.setContaminantsFile(options.getOption("-cf", "conFile", "File of contaminant taxa (one Id or name per line)", ""));
options.comment("Heuristics:");
maltOptions.setMaxSeedsPerOffsetPerFrame(options.getOption("spf", "maxSeedsPerFrame", "Maximum number of seed matches per offset per read frame", maltOptions.getMaxSeedsPerOffsetPerFrame()));
@@ -299,9 +299,9 @@ public class MaltRun {
MappingManager.loadMappings(cNames, indexDirectory);
}
- final GeneTableAccess geneTableAccess;
- if ((new File(indexDirectory, "annotation.idx")).exists()) {
- geneTableAccess = new GeneTableAccess(new File(indexDirectory, "annotation.idx"), new File(indexDirectory, "annotation.db"));
+ final GeneItemAccessor geneTableAccess;
+ if ((new File(indexDirectory, "aadd.idx")).exists()) {
+ geneTableAccess = new GeneItemAccessor(new File(indexDirectory, "aadd.idx"), new File(indexDirectory, "aadd.dbx"));
maltOptions.setParseHeaders(true);
} else
geneTableAccess = null;
@@ -356,7 +356,7 @@ public class MaltRun {
final String matchesOutputFile,
final String alignedReadsOutputFile, final String unalignedReadsOutputFile,
final ReferencesDBAccess referencesDB, final ReferencesHashTableAccess[] tables,
- final GeneTableAccess geneTableAccess) throws IOException, JAXBException {
+ final GeneItemAccessor geneTableAccess) throws IOException {
final ExecutorService executor = Executors.newFixedThreadPool(maltOptions.getNumberOfThreads());
final CountDownLatch countDownLatch = new CountDownLatch(maltOptions.getNumberOfThreads());
=====================================
src/malt/Version.java
=====================================
--- a/src/malt/Version.java
+++ b/src/malt/Version.java
@@ -25,5 +25,5 @@ package malt;
*/
public class Version {
public static final String NAME = "MALT";
- public static final String SHORT_DESCRIPTION = "MALT (version 0.4.0, built 6 Sep 2017)";
+ public static final String SHORT_DESCRIPTION = "MALT (version 0.4.1, built 29 June 2018)";
}
=====================================
src/malt/data/ReferencesDBBuilder.java
=====================================
--- a/src/malt/data/ReferencesDBBuilder.java
+++ b/src/malt/data/ReferencesDBBuilder.java
@@ -19,12 +19,14 @@
*/
package malt.data;
+import jloda.util.Basic;
import jloda.util.CanceledException;
import jloda.util.ProgressPercentage;
import malt.io.FastAFileIteratorBytes;
import megan.io.OutputWriter;
import java.io.*;
+import java.util.Iterator;
import java.util.List;
/**
@@ -105,6 +107,37 @@ public class ReferencesDBBuilder implements ISequenceAccessor {
}
/**
+ * gets an iterable over all ref names as strings
+ *
+ * @return iterable
+ */
+ public Iterable<String> refNames() {
+ return new Iterable<String>() {
+ @Override
+ public Iterator<String> iterator() {
+ return new Iterator<String>() {
+ private int i = 0;
+
+ @Override
+ public boolean hasNext() {
+ return i < numberOfSequences;
+ }
+
+ @Override
+ public String next() {
+ return Basic.getAccessionWord(headers[i++]);
+ }
+
+ @Override
+ public void remove() {
+
+ }
+ };
+ }
+ };
+ }
+
+ /**
* Get sequence. Index starts at 0
*
* @param index
@@ -259,9 +292,9 @@ public class ReferencesDBBuilder implements ISequenceAccessor {
pos++;
byte[] add;
if (header[pos - 1] == '|')
- add = String.format("%s%d|", tag, id).getBytes();
+ add = String.format("%s%d", tag, id).getBytes();
else
- add = String.format("|%s%d|", tag, id).getBytes();
+ add = String.format("|%s%d", tag, id).getBytes();
byte[] newHeader = new byte[header.length + add.length];
System.arraycopy(header, 0, newHeader, 0, pos);
=====================================
src/malt/genes/GeneItem.java
=====================================
--- a/src/malt/genes/GeneItem.java
+++ b/src/malt/genes/GeneItem.java
@@ -1,4 +1,4 @@
-/**
+/*
* GeneItem.java
* Copyright (C) 2018 Daniel H. Huson
* <p>
@@ -22,6 +22,7 @@ package malt.genes;
import jloda.util.Basic;
import megan.io.InputReader;
import megan.io.OutputWriter;
+import megan.util.interval.Interval;
import java.io.IOException;
import java.io.RandomAccessFile;
@@ -33,53 +34,39 @@ import java.io.RandomAccessFile;
*/
public class GeneItem {
private byte[] proteinId;
- private int keggId;
- private int cogId;
- private int seedId;
- private int interproId;
+ private final GeneItemCreator creator;
private boolean reverse;
- public GeneItem() {
+ private final int[] ids;
+
+ GeneItem(GeneItemCreator creator) {
+ this.creator = creator;
+ ids = new int[creator.numberOfClassifications()];
}
public byte[] getProteinId() {
return proteinId;
}
- public void setProteinId(byte[] proteinId) {
+ public void setProteinId(byte[] proteinId) throws IOException {
this.proteinId = proteinId;
+ creator.map(Basic.toString(proteinId), ids);
}
- public int getKeggId() {
- return keggId;
- }
-
- public void setKeggId(int keggId) {
- this.keggId = keggId;
- }
-
- public int getCogId() {
- return cogId;
- }
-
- public void setCogId(int cogId) {
- this.cogId = cogId;
- }
-
- public int getSeedId() {
- return seedId;
+ public int getId(String classificationName) {
+ return getId(creator.rank(classificationName));
}
- public void setSeedId(int seedId) {
- this.seedId = seedId;
+ public int getId(Integer rank) {
+ return rank == null ? 0 : ids[rank];
}
- public int getInterproId() {
- return interproId;
+ public void setId(String classificationName, int id) {
+ ids[creator.rank(classificationName)] = id;
}
- public void setInterproId(int interproId) {
- this.interproId = interproId;
+ public void setId(int rank, int id) {
+ ids[rank] = id;
}
public boolean isReverse() {
@@ -91,13 +78,13 @@ public class GeneItem {
}
public String toString() {
- return "proteinId=" + (proteinId == null ? "null" : Basic.toString(proteinId))
- + ", keggId=" + keggId
- + ", cogId=" + cogId
- + ", seedId=" + seedId
- + ", interProId=" + interproId
- + ", reverse=" + reverse;
+ final StringBuilder buf = new StringBuilder("proteinId=" + (proteinId == null ? "null" : Basic.toString(proteinId)));
+ for (int i = 0; i < creator.numberOfClassifications(); i++) {
+ buf.append(", ").append(creator.classification(i)).append("=").append(ids[i]);
+ }
+ buf.append(", reverse=").append(reverse);
+ return buf.toString();
}
/**
@@ -113,10 +100,9 @@ public class GeneItem {
outs.writeInt(proteinId.length);
outs.write(proteinId);
}
- outs.writeInt(keggId);
- outs.writeInt(cogId);
- outs.writeInt(seedId);
- outs.writeInt(interproId);
+ for (int i = 0; i < creator.numberOfClassifications(); i++) {
+ outs.writeInt(ids[i]);
+ }
outs.write(reverse ? 1 : 0);
}
@@ -135,10 +121,9 @@ public class GeneItem {
if (ins.read(proteinId, 0, length) != length)
throw new IOException("read failed");
}
- keggId = ins.readInt();
- cogId = ins.readInt();
- seedId = ins.readInt();
- interproId = ins.readInt();
+ for (int i = 0; i < creator.numberOfClassifications(); i++) {
+ ids[i] = ins.readInt();
+ }
reverse = (ins.read() == 1);
}
@@ -157,10 +142,26 @@ public class GeneItem {
if (ins.read(proteinId, 0, length) != length)
throw new IOException("read failed");
}
- keggId = ins.readInt();
- cogId = ins.readInt();
- seedId = ins.readInt();
- interproId = ins.readInt();
+ for (int i = 0; i < creator.numberOfClassifications(); i++) {
+ ids[i] = ins.readInt();
+ }
reverse = (ins.read() == 1);
}
+
+ /**
+ * get the annotation string
+ *
+ * @param refInterval
+ * @return annotation string
+ */
+ public String getAnnotation(Interval<GeneItem> refInterval) {
+ final StringBuilder buf = new StringBuilder();
+ buf.append("pos|").append(isReverse() ? refInterval.getEnd() + ".." + refInterval.getStart() : refInterval.getStart() + ".." + refInterval.getEnd());
+ buf.append("|ref|").append(Basic.toString(getProteinId()));
+ for (int i = 0; i < creator.numberOfClassifications(); i++) {
+ if (getId(i) > 0)
+ buf.append("|").append(creator.getShortTag(i)).append(getId(i));
+ }
+ return buf.toString();
+ }
}
=====================================
src/malt/genes/GeneTableAccess.java → src/malt/genes/GeneItemAccessor.java
=====================================
--- a/src/malt/genes/GeneTableAccess.java
+++ b/src/malt/genes/GeneItemAccessor.java
@@ -1,5 +1,5 @@
-/**
- * GeneTableAccess.java
+/*
+ * GeneItemAccessor.java
* Copyright (C) 2018 Daniel H. Huson
* <p>
* (Some files contain contributions from other authors, who are then mentioned separately.)
@@ -20,21 +20,29 @@
package malt.genes;
import jloda.util.*;
+import malt.MaltBuild;
+import megan.classification.IdMapper;
+import megan.io.InputReader;
+import megan.tools.AAdderBuild;
+import megan.tools.AAdderRun;
import megan.util.interval.Interval;
import megan.util.interval.IntervalTree;
import java.io.*;
/**
- * class used to access gene table
- * Daniel Huson, 8.2014
+ * class used to access gene items
+ * Daniel Huson, 6.2018
*/
-public class GeneTableAccess {
+public class GeneItemAccessor {
private final int size;
private final long[] refIndex2FilePos;
private final IntervalTree<GeneItem>[] refIndex2Intervals;
+ private final String[] index2ref;
private final RandomAccessFile dbRaf;
+ private final GeneItemCreator creator;
+
private final int syncBits = 1023;
private final Object[] syncObjects = new Object[syncBits + 1]; // use lots of objects to synchronize on so that threads don't in each others way
@@ -45,28 +53,38 @@ public class GeneTableAccess {
* @param dbFile
* @throws IOException
*/
- public GeneTableAccess(File indexFile, File dbFile) throws IOException {
+ public GeneItemAccessor(File indexFile, File dbFile) throws IOException {
// create the synchronization objects
for (int i = 0; i < (syncBits + 1); i++) {
syncObjects[i] = new Object();
}
- try (DataInputStream ins = new DataInputStream(new BufferedInputStream(new FileInputStream(indexFile))); ProgressPercentage progress = new ProgressPercentage("Reading file: " + indexFile)) {
- Basic.readAndVerifyMagicNumber(ins, GeneTableBuilder.MAGIC_NUMBER_IDX);
+ try (InputReader ins = new InputReader(indexFile); ProgressPercentage progress = new ProgressPercentage("Reading file: " + indexFile)) {
+ AAdderRun.readAndVerifyMagicNumber(ins, AAdderBuild.MAGIC_NUMBER_IDX);
+ final String creator = ins.readString();
+ if (!creator.equals(MaltBuild.INDEX_CREATOR))
+ throw new IOException("Gene Item index not created by MALT");
size = ins.readInt();
progress.setMaximum(size);
refIndex2FilePos = new long[size];
+ index2ref = new String[size];
for (int i = 0; i < size; i++) {
- refIndex2FilePos[i] = ins.readLong();
+ index2ref[i] = ins.readString();
+ final long pos = ins.readLong();
+ refIndex2FilePos[i] = pos;
progress.incrementProgress();
}
}
refIndex2Intervals = (IntervalTree<GeneItem>[]) new IntervalTree[size];
- try (DataInputStream ins = new DataInputStream(new BufferedInputStream(new FileInputStream(dbFile)))) {
- Basic.readAndVerifyMagicNumber(ins, GeneTableBuilder.MAGIC_NUMBER_DB);
- if (ins.readInt() != size)
- throw new IOException("Sizes differ: " + indexFile + " vs " + dbFile);
+ try (InputReader dbxIns = new InputReader(dbFile)) {
+ AAdderRun.readAndVerifyMagicNumber(dbxIns, AAdderBuild.MAGIC_NUMBER_DBX);
+ final String[] cNames = new String[dbxIns.readInt()];
+ for (int i = 0; i < cNames.length; i++) {
+ cNames[i] = dbxIns.readString();
+ System.err.println(cNames[i]);
+ }
+ creator = new GeneItemCreator(cNames, new IdMapper[0]);
}
dbRaf = new RandomAccessFile(dbFile, "r");
}
@@ -85,14 +103,15 @@ public class GeneTableAccess {
if (refIndex2Intervals[refIndex] == null && refIndex2FilePos[refIndex] != 0) {
synchronized (dbRaf) {
try {
- dbRaf.seek(refIndex2FilePos[refIndex]);
+ final long pos = refIndex2FilePos[refIndex];
+ dbRaf.seek(pos);
int intervalsLength = dbRaf.readInt();
if (intervalsLength > 0) {
IntervalTree<GeneItem> intervals = new IntervalTree<>();
for (int i = 0; i < intervalsLength; i++) {
int start = dbRaf.readInt();
int end = dbRaf.readInt();
- GeneItem geneItem = new GeneItem();
+ GeneItem geneItem = new GeneItem(creator);
geneItem.read(dbRaf);
intervals.add(start, end, geneItem);
//System.err.println(refIndex+"("+start+"-"+end+") -> "+geneItem);
@@ -133,36 +152,16 @@ public class GeneTableAccess {
if (refInterval != null) {
final GeneItem geneItem = refInterval.getData();
- final StringBuilder buf = new StringBuilder();
- buf.append("|ref|").append(Basic.toString(geneItem.getProteinId()));
- if (geneItem.getKeggId() != 0)
- buf.append("|kegg|").append(geneItem.getKeggId());
- if (geneItem.getCogId() != 0)
- buf.append("|cog|").append(geneItem.getCogId());
- if (geneItem.getSeedId() != 0)
- buf.append("|seed|").append(geneItem.getSeedId());
- if (geneItem.getInterproId() != 0)
- buf.append("|ipr|").append(geneItem.getInterproId());
- if (buf.length() > 0) {
- String header = referenceHeader;
- final String remainder;
- final int len = header.indexOf(' ');
- if (len >= 0 && len < header.length()) {
- remainder = header.substring(len); // keep space...
- header = header.substring(0, len);
- } else
- remainder = "";
-
- return (header + (header.endsWith("|") ? "" : "|") + "pos|"
- + (geneItem.isReverse() ? refInterval.getEnd() + ".." + refInterval.getStart()
- : refInterval.getStart() + ".." + refInterval.getEnd())
- + buf.toString() + remainder);
- }
+ return Basic.swallowLeadingGreaterSign(Basic.getFirstWord(referenceHeader)) + "|" + geneItem.getAnnotation(refInterval);
}
}
return referenceHeader;
}
+ public String getIndex2ref(int i) {
+ return index2ref[i];
+ }
+
public int size() {
return size;
}
@@ -173,24 +172,25 @@ public class GeneTableAccess {
* @param args
*/
public static void main(String[] args) throws IOException, UsageException, CanceledException {
- args = new String[]{"-i", "/Users/huson/data/malt/genes2/index/annotation.idx"};
+ args = new String[]{"-i", "/Users/huson/data/malt/gff/index/aadd.idx"};
final ArgsOptions options = new ArgsOptions(args, null, "GeneTableDump", "Dump gene table");
- final String idxFile = options.getOptionMandatory("i", "idxFile", "Input annotation.idx file", "index/annotation.idx");
- final String dbFile = options.getOption("d", "dbFile", "Input annotation.db file", Basic.replaceFileSuffix(idxFile, ".db"));
+ final String idxFile = options.getOptionMandatory("i", "idxFile", "Input aadd.idx file", "index/aadd.idx");
+ final String dbFile = options.getOption("d", "dbFile", "Input aadd.dbx file", Basic.replaceFileSuffix(idxFile, ".dbx"));
final String outputFile = options.getOption("o", "output", "Output file (or stdout)", "stdout");
options.done();
- final GeneTableAccess geneTableAccess = new GeneTableAccess(new File(idxFile), new File(dbFile));
+ final GeneItemAccessor geneTableAccess = new GeneItemAccessor(new File(idxFile), new File(dbFile));
try (Writer w = new BufferedWriter(outputFile.equals("stdout") ? new OutputStreamWriter(System.out) : new FileWriter(outputFile))) {
- for (int idx = 0; idx < geneTableAccess.size(); idx++) {
- final IntervalTree<GeneItem> tree = geneTableAccess.getIntervals(idx);
+ for (int i = 0; i < geneTableAccess.size(); i++) {
+ System.err.println("ref[" + i + "]=" + geneTableAccess.getIndex2ref(i) + ":");
+ final IntervalTree<GeneItem> tree = geneTableAccess.getIntervals(i);
if (tree != null) {
if (true) {
System.err.println("Tree[" + idxFile + "]: " + Basic.abbreviateDotDotDot(tree.toString(), 1000));
} else {
- w.write("RefIndex=" + idx + "\n");
+ w.write("RefIndex=" + i + "\n");
for (Interval<GeneItem> interval : tree) {
w.write(interval.getStart() + " " + interval.getEnd() + ": " + interval.getData() + "\n");
=====================================
src/malt/genes/GeneItemCreator.java
=====================================
--- /dev/null
+++ b/src/malt/genes/GeneItemCreator.java
@@ -0,0 +1,100 @@
+/*
+ * Copyright (C) 2015 Daniel H. Huson
+ *
+ * (Some files contain contributions from other authors, who are then mentioned separately.)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package malt.genes;
+
+import megan.classification.Classification;
+import megan.classification.IdMapper;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+
+/**
+ * class for creating gene items
+ * Daniel Huson, 6.2018
+ */
+public class GeneItemCreator {
+ private final String[] cNames;
+ private final Map<String, Integer> rank;
+ private final IdMapper[] idMappers;
+ private final String[] shortTags;
+
+ /**
+ * constructor
+ *
+ * @param cNames
+ */
+ public GeneItemCreator(String[] cNames, IdMapper[] idMappers) {
+ this.cNames = cNames.clone();
+ this.idMappers = idMappers;
+ rank = new HashMap<>();
+ for (int i = 0; i < this.cNames.length; i++) {
+ rank.put(this.cNames[i], i);
+ }
+ this.shortTags = new String[this.cNames.length];
+ for (int i = 0; i < cNames.length; i++) {
+ shortTags[i] = Classification.createShortTag(cNames[i]);
+ }
+ }
+
+ public GeneItem createGeneItem() {
+ return new GeneItem(this);
+ }
+
+ public Integer rank(String classificationName) {
+ return rank.get(classificationName);
+ }
+
+ public String classification(int rank) {
+ return cNames[rank];
+ }
+
+ public int numberOfClassifications() {
+ return cNames.length;
+ }
+
+ public Iterable<String> cNames() {
+ return new Iterable<String>() {
+ @Override
+ public Iterator<String> iterator() {
+ return Arrays.asList(cNames).iterator();
+ }
+ };
+ }
+
+ /**
+ * map the given accession to ids using the id mappers
+ *
+ * @param accession
+ * @param ids
+ * @throws IOException
+ */
+ public void map(String accession, int[] ids) throws IOException {
+ for (int i = 0; i < idMappers.length; i++) {
+ ids[i] = idMappers[i].getAccessionMap().get(accession);
+ }
+ }
+
+ public String getShortTag(int i) {
+ return shortTags[i];
+ }
+}
=====================================
src/malt/genes/GeneTableBuilder.java deleted
=====================================
--- a/src/malt/genes/GeneTableBuilder.java
+++ /dev/null
@@ -1,319 +0,0 @@
-/*
- * Copyright (C) 2018 Daniel H. Huson
- *
- * (Some files contain contributions from other authors, who are then mentioned separately.)
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-package malt.genes;
-
-import jloda.util.Basic;
-import jloda.util.CanceledException;
-import jloda.util.ProgressPercentage;
-import malt.data.ReferencesDBBuilder;
-import megan.classification.ClassificationManager;
-import megan.classification.IdMapper;
-import megan.classification.util.TaggedValueIterator;
-import megan.io.OutputWriter;
-import megan.util.interval.Interval;
-
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.concurrent.ArrayBlockingQueue;
-import java.util.concurrent.CountDownLatch;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-
-
-/**
- * Builds a table mapping reference indices and positions to genes
- * Daniel Huson, 8.2014, 11.2017
- */
-public class GeneTableBuilder {
- final static byte[] MAGIC_NUMBER_IDX = "MAAnnoIdxV0.1.".getBytes();
- final static byte[] MAGIC_NUMBER_DB = "MAAnnoDbV0.1.".getBytes();
-
- public static final String[] ACCESSION_TAGS = new String[]{"gb|", "ref|"};
-
- private final IdMapper keggMapper;
- private final IdMapper cogMapper;
- private final IdMapper seedMapper;
- private final IdMapper interproMapper;
-
- private final int syncBits = 1023;
- private final Object[] syncObjects = new Object[syncBits + 1]; // use lots of objects to synchronize on so that threads don't in each others way
-
- /**
- * constructor
- *
- * @throws IOException
- */
- public GeneTableBuilder() throws IOException {
- // create the synchronization objects
- for (int i = 0; i < (syncBits + 1); i++) {
- syncObjects[i] = new Object();
- }
-
- if (ClassificationManager.get("KEGG", false).getIdMapper().isActiveMap(IdMapper.MapType.Accession))
- keggMapper = ClassificationManager.get("KEGG", false).getIdMapper();
- else
- keggMapper = null;
- if (ClassificationManager.get("EGGNOG", false).getIdMapper().isActiveMap(IdMapper.MapType.Accession))
- cogMapper = ClassificationManager.get("EGGNOG", false).getIdMapper();
- else
- cogMapper = null;
- if (ClassificationManager.get("SEED", false).getIdMapper().isActiveMap(IdMapper.MapType.Accession))
- seedMapper = ClassificationManager.get("SEED", false).getIdMapper();
- else
- seedMapper = null;
-
- if (ClassificationManager.get("INTERPRO2GO", false).getIdMapper().isActiveMap(IdMapper.MapType.Accession))
- interproMapper = ClassificationManager.get("INTERPRO2GO", false).getIdMapper();
- else
- interproMapper = null;
-
- }
-
-
- /**
- * build and then save the gene table
- *
- * @param referencesDB
- * @param gffFiles
- * @param indexFile
- * @param numberOfThreads
- * @throws IOException
- */
- public void buildAndSaveAnnotations(final ReferencesDBBuilder referencesDB, final Collection<String> gffFiles, final File indexFile, final File dbFile, final int numberOfThreads) throws IOException, CanceledException {
- System.err.println("Annotating reference sequences...");
- final Map<String, Integer> refAccession2IndexMap = computeRefAccession2IndexMap(referencesDB, numberOfThreads);
-
- final Collection<CDS> annotations = CDS.parseGFFforCDS(gffFiles, new ProgressPercentage());
-
- final ArrayList<Interval<GeneItem>>[] table = computeRefIndex2Intervals(referencesDB, refAccession2IndexMap, annotations, numberOfThreads);
- refAccession2IndexMap.clear();
-
- writeTable(indexFile, dbFile, table);
- }
-
- /**
- * Compute the reference accessions to reference index mapping
- *
- * @param referencesDB
- * @param numberOfThreads
- * @return accession to reference index mapping
- */
- private Map<String, Integer> computeRefAccession2IndexMap(final ReferencesDBBuilder referencesDB, final int numberOfThreads) {
- final Map<String, Integer> accession2refIndex = new HashMap<>(referencesDB.getNumberOfSequences(), 1f);
-
- final ExecutorService executor = Executors.newFixedThreadPool(numberOfThreads);
- final CountDownLatch countDownLatch = new CountDownLatch(numberOfThreads);
-
- final ProgressPercentage progress = new ProgressPercentage("Mapping accessions to references...", referencesDB.getNumberOfSequences());
-
- // launch the worker threads
- for (int thread = 0; thread < numberOfThreads; thread++) {
- final int threadNumber = thread;
- executor.execute(new Runnable() {
- public void run() {
- try {
- final TaggedValueIterator it = new TaggedValueIterator(true, true, ACCESSION_TAGS);
- for (int refIndex = threadNumber; refIndex < referencesDB.getNumberOfSequences(); refIndex += numberOfThreads) {
- it.restart(Basic.toString(referencesDB.getHeader(refIndex)));
- while (it.hasNext()) {
- synchronized (accession2refIndex) {
- accession2refIndex.put(it.next(), refIndex);
- }
- }
- progress.incrementProgress();
- }
- } catch (Exception ex) {
- Basic.caught(ex);
- System.exit(1); // just die...
- } finally {
- countDownLatch.countDown();
- }
- }
- });
- }
-
- try {
- countDownLatch.await(); // await completion of alignment threads
- } catch (InterruptedException e) {
- Basic.caught(e);
- } finally {
- // shut down threads:
- executor.shutdownNow();
- }
- progress.close();
- return accession2refIndex;
- }
-
- /**
- * compute the reference Id to intervals mapping
- *
- * @param referencesDB
- * @param refAccession2IdMap
- * @param cdsList
- * @param numberOfThreads
- * @return
- * @throws FileNotFoundException
- */
- private ArrayList<Interval<GeneItem>>[] computeRefIndex2Intervals(final ReferencesDBBuilder referencesDB, final Map<String, Integer> refAccession2IdMap, final Collection<CDS> cdsList, int numberOfThreads) throws IOException {
- final ArrayList<Interval<GeneItem>>[] refIndex2Intervals = new ArrayList[referencesDB.getNumberOfSequences()];
-
- final ExecutorService executor = Executors.newFixedThreadPool(numberOfThreads);
- final CountDownLatch countDownLatch = new CountDownLatch(numberOfThreads);
-
- final ArrayBlockingQueue<CDS> queue = new ArrayBlockingQueue<>(10 * numberOfThreads);
- final CDS sentinel = new CDS();
-
- final int[] counts = new int[numberOfThreads];
-
- // launch the worker threads
- for (int thread = 0; thread < numberOfThreads; thread++) {
- final int threadNumber = thread;
- executor.execute(new Runnable() {
- public void run() {
- try {
- while (true) {
- final CDS cds = queue.take();
- if (cds == sentinel)
- break;
- final Integer refIndex = refAccession2IdMap.get(cds.getDnaId());
- if (refIndex != null) {
- synchronized (syncObjects[refIndex & syncBits]) {
- ArrayList<Interval<GeneItem>> list = refIndex2Intervals[refIndex];
- if (list == null)
- list = refIndex2Intervals[refIndex] = new ArrayList<>();
-
- final GeneItem geneItem = new GeneItem();
- final String accession = cds.getProteinId();
- geneItem.setProteinId(accession.getBytes());
- if (keggMapper != null) {
- final Integer id = keggMapper.getIdFromAccession(accession);
- if (id != null && id != 0) {
- geneItem.setKeggId(id);
- }
- }
- // set cog:
- if (cogMapper != null) {
- final Integer id = cogMapper.getIdFromAccession(accession);
- if (id != null && id != 0) {
- geneItem.setCogId(id);
- }
- }
- // set seed:
- if (seedMapper != null) {
- final Integer id = seedMapper.getIdFromAccession(accession);
- if (id != null && id != 0) {
- geneItem.setSeedId(id);
- }
- }
- // set interpro:
- if (interproMapper != null) {
- final Integer id = interproMapper.getIdFromAccession(accession);
- if (id != null && id != 0) {
- geneItem.setInterproId(id);
- }
- }
- geneItem.setReverse(cds.isReverse());
- list.add(new Interval<>(cds.getStart(), cds.getEnd(), geneItem));
- counts[threadNumber]++;
- }
- }
-
- }
- } catch (Exception ex) {
- Basic.caught(ex);
- System.exit(1); // just die...
- } finally {
- countDownLatch.countDown();
- }
- }
- });
- }
- try (ProgressPercentage progress = new ProgressPercentage("Annotating references", cdsList.size() + numberOfThreads)) {
- for (CDS cds : cdsList) {
- queue.put(cds);
- progress.incrementProgress();
- }
- for (int i = 0; i < numberOfThreads; i++) {
- queue.put(sentinel);
- progress.incrementProgress();
-
- }
- countDownLatch.await();
- } catch (InterruptedException ex) {
- Basic.caught(ex);
- } finally {
- executor.shutdownNow();
- }
- System.err.println(String.format("Count:%,14d", Basic.getSum(counts)));
- return refIndex2Intervals;
- }
-
- /**
- * save the annotations
- *
- * @param indexFile
- * @param refIndex2Intervals
- * @throws IOException
- */
- private static void writeTable(File indexFile, File dbFile, final ArrayList<Interval<GeneItem>>[] refIndex2Intervals) throws IOException {
- final long[] refIndex2FilePos = new long[refIndex2Intervals.length];
-
- int totalRefWithAGene = 0;
- try (OutputWriter outs = new OutputWriter(dbFile); ProgressPercentage progress = new ProgressPercentage("Writing file: " + dbFile, refIndex2Intervals.length)) {
- outs.write(GeneTableBuilder.MAGIC_NUMBER_DB);
-
- outs.writeInt(refIndex2Intervals.length);
-
- for (int i = 0; i < refIndex2Intervals.length; i++) {
- final ArrayList<Interval<GeneItem>> list = refIndex2Intervals[i];
- if (list == null) {
- refIndex2FilePos[i] = 0;
- outs.writeInt(0);
- } else {
- refIndex2FilePos[i] = outs.length();
- outs.writeInt(list.size());
- for (Interval<GeneItem> interval : Basic.randomize(list, 666)) { // need to save in random order
- outs.writeInt(interval.getStart());
- outs.writeInt(interval.getEnd());
- interval.getData().write(outs);
- }
- totalRefWithAGene++;
- }
- progress.incrementProgress();
- }
- }
- try (OutputWriter outs = new OutputWriter(indexFile); ProgressPercentage progress = new ProgressPercentage("Writing file: " + indexFile, refIndex2FilePos.length)) {
- outs.write(GeneTableBuilder.MAGIC_NUMBER_IDX);
- outs.writeInt(refIndex2FilePos.length);
- for (long filePos : refIndex2FilePos) {
- outs.writeLong(filePos);
- progress.incrementProgress();
- }
- }
-
- System.err.println(String.format("Reference sequences with at least one annotation: %,d of %,d", totalRefWithAGene, refIndex2Intervals.length));
- }
-}
-
-
=====================================
src/malt/tools/AAddBuild.java deleted
=====================================
--- a/src/malt/tools/AAddBuild.java
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright (C) 2015 Daniel H. Huson
- *
- * (Some files contain contributions from other authors, who are then mentioned separately.)
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-package malt.tools;
-
-import jloda.util.*;
-import malt.genes.CDS;
-import malt.genes.GeneItem;
-import megan.io.OutputWriter;
-import megan.util.interval.Interval;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.*;
-
-/**
- * build the aadd index
- * Daniel Huson, 5.2018
- */
-public class AAddBuild {
- final static byte[] MAGIC_NUMBER_IDX = "AAddIdxV0.1.".getBytes();
- final static byte[] MAGIC_NUMBER_DBX = "AAddDbxV0.1.".getBytes();
-
- /**
- * add functional annotations to DNA alignments
- */
- public static void main(String[] args) {
- try {
- ProgramProperties.setProgramName("AAddBuild");
- ProgramProperties.setProgramVersion(megan.main.Version.SHORT_DESCRIPTION);
-
- PeakMemoryUsageMonitor.start();
- (new AAddBuild()).run(args);
- System.err.println("Total time: " + PeakMemoryUsageMonitor.getSecondsSinceStartString());
- System.err.println("Peak memory: " + PeakMemoryUsageMonitor.getPeakUsageString());
- System.exit(0);
- } catch (Exception ex) {
- Basic.caught(ex);
- System.exit(1);
- }
- }
-
- /**
- * run the program
- */
- public void run(String[] args) throws CanceledException, IOException, UsageException {
- final ArgsOptions options = new ArgsOptions(args, this, "Build the index for AAdd");
- options.setVersion(ProgramProperties.getProgramVersion());
- options.setLicense("Copyright (C) 2018 Daniel H. Huson. This program comes with ABSOLUTELY NO WARRANTY.");
- options.setAuthors("Daniel H. Huson");
-
- options.comment("Input Output");
- final List<String> gffFiles = options.getOptionMandatory("-igff", "inputGFF", "Input GFF3 files or directory (.gz ok)", new LinkedList<String>());
- final String indexDirectory = options.getOptionMandatory("-d", "index", "Index directory", "");
- options.comment(ArgsOptions.OTHER);
- final boolean lookInside = options.getOption("-ex", "extraStrict", "When given an input directory, look inside every input file to check that it is indeed in GFF3 format", false);
- options.done();
-
- if (gffFiles.size() == 1) {
- final File file = new File(gffFiles.get(0));
- if (file.isDirectory()) {
- System.err.println("Collecting all GFF3 files in directory: " + file);
- gffFiles.clear();
- for (File aFile : Basic.getAllFilesInDirectory(file, new GFF3FileFilter(true, lookInside), true)) {
- gffFiles.add(aFile.getPath());
- }
- if (gffFiles.size() == 0)
- throw new IOException("No GFF files found in directory: " + file);
- else
- System.err.println(String.format("Found: %,d", gffFiles.size()));
- }
- }
-
-
- // obtains the gene annotations:
- Map<String, ArrayList<Interval<GeneItem>>> dnaId2list = new HashMap<>();
- {
- final Collection<CDS> annotations = CDS.parseGFFforCDS(gffFiles, new ProgressPercentage("Processing GFF files"));
-
- try (ProgressListener progress = new ProgressPercentage("Building annotation list", annotations.size())) {
- for (CDS cds : annotations) {
- ArrayList<Interval<GeneItem>> list = dnaId2list.get(cds.getDnaId());
- if (list == null) {
- list = new ArrayList<>();
- dnaId2list.put(cds.getDnaId(), list);
- }
- final GeneItem geneItem = new GeneItem();
- final String accession = cds.getProteinId();
- geneItem.setProteinId(accession.getBytes());
- geneItem.setReverse(cds.isReverse());
- list.add(new Interval<>(cds.getStart(), cds.getEnd(), geneItem));
- progress.incrementProgress();
- }
- }
- }
-
- // writes the index file:
- long totalRefWithAGene = 0;
-
- final File indexFile = new File(indexDirectory, "aadd.idx");
- final File dbFile = new File(indexDirectory, "aadd.dbx");
- try (OutputWriter idxWriter = new OutputWriter(indexFile); OutputWriter dbxWriter = new OutputWriter(dbFile);
- ProgressPercentage progress = new ProgressPercentage("Writing files: " + indexFile + "\n " + dbFile, dnaId2list.size())) {
- idxWriter.write(MAGIC_NUMBER_IDX);
- dbxWriter.write(MAGIC_NUMBER_DBX);
-
- idxWriter.writeInt(dnaId2list.size());
-
- for (String dnaId : dnaId2list.keySet()) {
- idxWriter.writeString(dnaId);
- final ArrayList<Interval<GeneItem>> list = dnaId2list.get(dnaId);
- if (list == null) {
- idxWriter.writeLong(0); // no intervals
- } else {
- idxWriter.writeLong(dbxWriter.getPosition()); // position of intervals in DB file
-
- dbxWriter.writeInt(list.size());
- for (Interval<GeneItem> interval : Basic.randomize(list, 666)) { // need to save in random order
- dbxWriter.writeInt(interval.getStart());
- dbxWriter.writeInt(interval.getEnd());
- interval.getData().write(dbxWriter);
- }
- totalRefWithAGene++;
- }
- progress.incrementProgress();
- }
- }
-
- System.err.println(String.format("Reference sequences with at least one annotation: %,d of %,d", totalRefWithAGene, dnaId2list.size()));
- }
-}
=====================================
src/malt/tools/AAddRun.java deleted
=====================================
--- a/src/malt/tools/AAddRun.java
+++ /dev/null
@@ -1,261 +0,0 @@
-/*
- * Copyright (C) 2015 Daniel H. Huson
- *
- * (Some files contain contributions from other authors, who are then mentioned separately.)
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-package malt.tools;
-
-import jloda.util.*;
-import malt.genes.GeneItem;
-import megan.io.FileRandomAccessReadOnlyAdapter;
-import megan.io.InputReader;
-import megan.util.interval.Interval;
-import megan.util.interval.IntervalTree;
-
-import java.io.*;
-import java.util.*;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-import java.util.zip.GZIPOutputStream;
-
-/**
- * add functional annotations to DNA alignments
- * Daniel Huson, 5.2018
- */
-public class AAddRun {
- /**
- * add functional annotations to DNA alignments
- */
- public static void main(String[] args) {
- try {
- ProgramProperties.setProgramName("AAddRun");
- ProgramProperties.setProgramVersion(megan.main.Version.SHORT_DESCRIPTION);
-
- PeakMemoryUsageMonitor.start();
- (new AAddRun()).run(args);
- System.err.println("Total time: " + PeakMemoryUsageMonitor.getSecondsSinceStartString());
- System.err.println("Peak memory: " + PeakMemoryUsageMonitor.getPeakUsageString());
- System.exit(0);
- } catch (Exception ex) {
- Basic.caught(ex);
- System.exit(1);
- }
- }
-
- /**
- * run the program
- */
- public void run(String[] args) throws CanceledException, IOException, UsageException {
- final ArgsOptions options = new ArgsOptions(args, this, "Adds functional accessions to DNA alignments");
- options.setVersion(ProgramProperties.getProgramVersion());
- options.setLicense("Copyright (C) 2018 Daniel H. Huson. This program comes with ABSOLUTELY NO WARRANTY.");
- options.setAuthors("Daniel H. Huson");
-
- options.comment("Input Output");
- final String[] inputFiles = options.getOptionMandatory("-i", "input", "Input SAM file(s) (.gz ok)", new String[0]);
- final String indexDirectory = options.getOptionMandatory("-d", "index", "AAdd index directory", "");
- final String[] outputFiles = options.getOptionMandatory("-o", "output", "Output file(s) (.gz ok) or directory", new String[0]);
- options.comment(ArgsOptions.OTHER);
- final boolean reportUnmappedAccessions = options.getOption("-rnf", "reportNotFound", "Report the names of DNA reference for which no functional accession is available", false);
- options.done();
-
- final File outputDir;
- if (outputFiles.length == 1 && ((new File(outputFiles[0])).isDirectory())) {
- outputDir = new File(outputFiles[0]);
- } else {
- outputDir = null;
- if (inputFiles.length != outputFiles.length)
- throw new UsageException("Number of output files doesn't match number of input files");
- }
-
- final Map<String, Pair<Long, IntervalTree<GeneItem>>> ref2PosAndTree;
- final File indexFile = new File(indexDirectory, "aadd.idx");
-
- try (InputReader ins = new InputReader(indexFile); ProgressPercentage progress = new ProgressPercentage("Reading file: " + indexFile)) {
- readAndVerifyMagicNumber(ins, AAddBuild.MAGIC_NUMBER_IDX);
- final int entries = ins.readInt();
- progress.setMaximum(entries);
-
- ref2PosAndTree = new HashMap<>(2 * entries);
-
- for (int t = 0; t < entries; t++) {
- final String dnaId = ins.readString();
- final long pos = ins.readLong();
- ref2PosAndTree.put(dnaId, new Pair<Long, IntervalTree<GeneItem>>(pos, null));
- progress.incrementProgress();
- }
- }
-
- final IntervalTree<GeneItem> emptyTree = new IntervalTree<>();
-
- final File dbFile = new File(indexDirectory, "aadd.dbx");
-
- try (FileRandomAccessReadOnlyAdapter dbxIns = new FileRandomAccessReadOnlyAdapter(dbFile)) {
- System.err.println("Opening file: " + dbFile);
-
- for (int i = 0; i < inputFiles.length; i++) {
- File inputFile = new File(inputFiles[i]);
- final File outputFile;
- if (outputDir != null) {
- outputFile = new File(outputDir, inputFile.getName() + ".out");
- } else
- outputFile = new File(outputFiles[i]);
- if (inputFile.equals(outputFile))
- throw new IOException("Input file equals output file: " + inputFile);
- final boolean gzipOutput = outputFile.getName().toLowerCase().endsWith(".gz");
-
- long countLines = 0;
- long countAlignments = 0;
- long countAnnotated = 0;
- long countReferencesLoaded = 0;
-
- final Set<String> refNotFound = new HashSet<>();
-
- try (final FileInputIterator it = new FileInputIterator(inputFile, true);
- final BufferedWriter w = new BufferedWriter(new OutputStreamWriter(gzipOutput ? new GZIPOutputStream(new FileOutputStream(outputFile)) : new FileOutputStream(outputFile)))) {
- System.err.println("Writing file: " + outputFile);
-
- while (it.hasNext()) {
- final String aLine = it.next();
- if (aLine.startsWith("@"))
- w.write(aLine);
- else {
-
- final String[] tokens = Basic.split(aLine, '\t');
-
- if (tokens.length < 2 || tokens[2].equals("*")) {
- w.write(aLine);
- } else {
- final IntervalTree<GeneItem> tree;
- {
- final int pos = tokens[2].indexOf(".");
- final String ref = (pos > 0 ? tokens[2].substring(0, pos) : tokens[2]);
-
- final Pair<Long, IntervalTree<GeneItem>> pair = ref2PosAndTree.get(ref);
-
- if (pair != null) {
- if (pair.getSecond() == null && pair.getFirst() != 0) {
- dbxIns.seek(pair.getFirst());
-
- int intervalsLength = dbxIns.readInt();
- if (intervalsLength > 0) {
- tree = new IntervalTree<>();
- for (int t = 0; t < intervalsLength; t++) {
- int start = dbxIns.readInt();
- int end = dbxIns.readInt();
- GeneItem geneItem = new GeneItem();
- geneItem.read(dbxIns);
- tree.add(start, end, geneItem);
- //System.err.println(refIndex+"("+start+"-"+end+") -> "+geneItem);
- }
- countReferencesLoaded++;
- } else {
- tree = emptyTree;
- }
- pair.setSecond(tree);
- } else {
- tree = pair.getSecond();
- }
- } else {
- if (!refNotFound.contains(ref)) {
- refNotFound.add(ref);
- if (reportUnmappedAccessions)
- System.err.println("Reference not found: " + ref);
- }
- continue;
- }
- }
-
- final int startSubject = Basic.parseInt(tokens[3]);
- final int endSubject = startSubject + getRefLength(tokens[5]) - 1;
-
- final Interval<GeneItem> refInterval = tree.getBestInterval(new Interval<GeneItem>(startSubject, endSubject, null), 0.9);
-
- String annotatedRef = tokens[2];
- if (refInterval != null) {
- final GeneItem geneItem = refInterval.getData();
- final String remainder;
- final int len = annotatedRef.indexOf(' ');
- if (len >= 0 && len < annotatedRef.length()) {
- remainder = annotatedRef.substring(len); // keep space...
- annotatedRef = annotatedRef.substring(0, len);
- } else
- remainder = "";
- annotatedRef += (annotatedRef.endsWith("|") ? "pos|" : "|pos|") + (geneItem.isReverse() ? refInterval.getEnd() + ".." + refInterval.getStart()
- : refInterval.getStart() + ".." + refInterval.getEnd()) + "|ref|" + Basic.toString(geneItem.getProteinId()) + remainder;
- }
- for (int t = 0; t < tokens.length; t++) {
- if (t > 0)
- w.write('\t');
- if (t == 2 && !annotatedRef.equals(tokens[2])) {
- w.write(annotatedRef);
- countAnnotated++;
- } else
- w.write(tokens[t]);
- }
- }
- countAlignments++;
- }
- w.write("\n");
- countLines++;
- }
- }
-
- System.err.println(String.format("Lines: %,11d", countLines));
- System.err.println(String.format("Alignments:%,11d", countAlignments));
- System.err.println(String.format("Annotated: %,11d", countAnnotated));
- System.err.println(String.format("(Loaded refs:%,9d)", countReferencesLoaded));
- if (refNotFound.size() > 0)
- System.err.println(String.format("(Missing refs:%,8d)", refNotFound.size()));
- }
- }
- }
-
- private static Pattern pattern = Pattern.compile("[0-9]+[MDN]+");
-
- public static int getRefLength(String cigar) {
- final Matcher matcher = pattern.matcher(cigar);
- final ArrayList<String> pairs = new ArrayList<>();
- while (matcher.find())
- pairs.add(matcher.group());
-
- int length = 0;
- for (String p : pairs) {
- int num = Integer.parseInt(p.substring(0, p.length() - 1));
- length += num;
- }
- return length;
-
- }
-
- /**
- * read and verify a magic number from a stream
- *
- * @param ins
- * @param expectedMagicNumber
- * @throws java.io.IOException
- */
- public static void readAndVerifyMagicNumber(InputReader ins, byte[] expectedMagicNumber) throws IOException {
- byte[] magicNumber = new byte[expectedMagicNumber.length];
- if (ins.read(magicNumber, 0, magicNumber.length) != expectedMagicNumber.length || !Basic.equal(magicNumber, expectedMagicNumber)) {
- System.err.println("Expected: " + Basic.toString(expectedMagicNumber));
- System.err.println("Got: " + Basic.toString(magicNumber));
- throw new IOException("Index is too old or incorrect file (wrong magic number). Please recompute index.");
- }
- }
-
-}
=====================================
tex/manual/manual.pdf
=====================================
Binary files a/tex/manual/manual.pdf and b/tex/manual/manual.pdf differ
=====================================
tex/manual/versioninfo.tex
=====================================
--- a/tex/manual/versioninfo.tex
+++ b/tex/manual/versioninfo.tex
@@ -1 +1 @@
-\def\VERSION{{0.4.0}}
+\def\VERSION{{0.4.1}}
=====================================
tex/manual/versioninfo_new.tex
=====================================
--- a/tex/manual/versioninfo_new.tex
+++ b/tex/manual/versioninfo_new.tex
@@ -1 +1 @@
-\def\VERSION{{0.4.0}}
+\def\VERSION{{0.4.1}}
=====================================
tex/manual/versioninfo_old.tex
=====================================
--- a/tex/manual/versioninfo_old.tex
+++ b/tex/manual/versioninfo_old.tex
@@ -1 +1 @@
-\def\VERSION{{0.4.0}}
+\def\VERSION{{0.4.1}}
View it on GitLab: https://salsa.debian.org/med-team/malt/commit/d3e6b2a146f7bb1987b40e84de890a22e9b62a60
--
View it on GitLab: https://salsa.debian.org/med-team/malt/commit/d3e6b2a146f7bb1987b40e84de890a22e9b62a60
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20180804/524b6580/attachment-0001.html>
More information about the debian-med-commit
mailing list