[med-svn] [ruby-rgfa] 01/02: New upstream version 1.2
Sascha Steinbiss
satta at debian.org
Sat Sep 24 20:56:14 UTC 2016
This is an automated email from the git hooks/post-receive script.
satta pushed a commit to branch master
in repository ruby-rgfa.
commit e74c46300a5664c9aaf0bdd7746590ddc26c1f57
Author: Sascha Steinbiss <satta at debian.org>
Date: Sat Sep 24 17:19:30 2016 +0000
New upstream version 1.2
---
.gitignore | 10 +
CHANGELOG | 24 +
CONTRIBUTORS | 4 +
LICENSE | 19 +
README.md | 43 ++
Rakefile | 53 ++
bin/gfadiff.rb | 420 +++++++++++++
bin/rgfa-findcrisprs.rb | 208 +++++++
bin/rgfa-mergelinear.rb | 14 +
bin/rgfa-simdebruijn.rb | 86 +++
cheatsheet/rgfa-cheatsheet-1.2.tex | 177 ++++++
lib/rgfa.rb | 376 +++++++++++
lib/rgfa/byte_array.rb | 74 +++
lib/rgfa/cigar.rb | 156 +++++
lib/rgfa/connectivity.rb | 131 ++++
lib/rgfa/containments.rb | 97 +++
lib/rgfa/error.rb | 3 +
lib/rgfa/field_array.rb | 87 +++
lib/rgfa/field_parser.rb | 109 ++++
lib/rgfa/field_validator.rb | 241 ++++++++
lib/rgfa/field_writer.rb | 108 ++++
lib/rgfa/headers.rb | 76 +++
lib/rgfa/line.rb | 721 ++++++++++++++++++++++
lib/rgfa/line/containment.rb | 87 +++
lib/rgfa/line/header.rb | 92 +++
lib/rgfa/line/link.rb | 379 ++++++++++++
lib/rgfa/line/path.rb | 106 ++++
lib/rgfa/line/segment.rb | 207 +++++++
lib/rgfa/linear_paths.rb | 285 +++++++++
lib/rgfa/lines.rb | 155 +++++
lib/rgfa/links.rb | 242 ++++++++
lib/rgfa/logger.rb | 192 ++++++
lib/rgfa/multiplication.rb | 156 +++++
lib/rgfa/numeric_array.rb | 196 ++++++
lib/rgfa/paths.rb | 98 +++
lib/rgfa/rgl.rb | 194 ++++++
lib/rgfa/segment_ends_path.rb | 7 +
lib/rgfa/segment_info.rb | 162 +++++
lib/rgfa/segments.rb | 99 +++
lib/rgfa/sequence.rb | 65 ++
lib/rgfatools.rb | 102 +++
lib/rgfatools/artifacts.rb | 29 +
lib/rgfatools/copy_number.rb | 126 ++++
lib/rgfatools/invertible_segments.rb | 104 ++++
lib/rgfatools/linear_paths.rb | 140 +++++
lib/rgfatools/multiplication.rb | 194 ++++++
lib/rgfatools/p_bubbles.rb | 66 ++
lib/rgfatools/superfluous_links.rb | 64 ++
pdfdoc/cover.css | 4 +
pdfdoc/cover.html | 14 +
pdfdoc/print.css | 2 +
rgfa.gemspec | 68 ++
test/test_rgfa.rb | 101 +++
test/test_rgfa_byte_array.rb | 41 ++
test/test_rgfa_cigar.rb | 33 +
test/test_rgfa_edit.rb | 96 +++
test/test_rgfa_field_parser.rb | 55 ++
test/test_rgfa_field_validator.rb | 56 ++
test/test_rgfa_field_writer.rb | 45 ++
test/test_rgfa_line.rb | 199 ++++++
test/test_rgfa_line_containment.rb | 43 ++
test/test_rgfa_line_creators.rb | 143 +++++
test/test_rgfa_line_destructors.rb | 93 +++
test/test_rgfa_line_getters.rb | 246 ++++++++
test/test_rgfa_line_header.rb | 17 +
test/test_rgfa_line_link.rb | 43 ++
test/test_rgfa_line_path.rb | 48 ++
test/test_rgfa_line_segment.rb | 64 ++
test/test_rgfa_segment_references.rb | 20 +
test/test_rgfa_sequence.rb | 19 +
test/test_rgfa_traverse.rb | 96 +++
test/test_rgfatools.rb | 11 +
test/test_rgfatools_artifacts.rb | 34 +
test/test_rgfatools_copy_number.rb | 44 ++
test/test_rgfatools_linear_paths.rb | 52 ++
test/test_rgfatools_multiplication.rb | 183 ++++++
test/testdata/dead_ends.gfa | 12 +
test/testdata/example1.gfa | 45 ++
test/testdata/example_from_spec.gfa | 9 +
test/testdata/example_from_spec.path14.seq | 1 +
test/testdata/example_from_spec2.gfa | 13 +
test/testdata/links_distri.l1.gfa | 4 +
test/testdata/links_distri.l1.m2.gfa | 6 +
test/testdata/links_distri.l2.gfa | 6 +
test/testdata/links_distri.l2.m2.gfa | 7 +
test/testdata/links_distri.l2.m2.no_ld.gfa | 9 +
test/testdata/links_distri.l2.m3.gfa | 8 +
test/testdata/links_distri.l2.m3.no_ld.gfa | 12 +
test/testdata/links_distri.l3.gfa | 8 +
test/testdata/links_distri.l3.m2.gfa | 10 +
test/testdata/links_distri.l3.m2.no_ld.gfa | 12 +
test/testdata/loop.gfa | 10 +
test/testdata/sample.gfa | 12 +
test/testdata/spec_q1.gfa | 8 +
test/testdata/spec_q2.gfa | 9 +
test/testdata/spec_q2.path_circular.seq | 1 +
test/testdata/spec_q2.path_linear.seq | 1 +
test/testdata/spec_q3.gfa | 13 +
test/testdata/spec_q4.gfa | 14 +
test/testdata/spec_q4.path_more_than_circular.seq | 1 +
test/testdata/spec_q5.gfa | 11 +
test/testdata/spec_q6.gfa | 9 +
test/testdata/spec_q7.gfa | 9 +
test/testdata/two_components.gfa | 11 +
104 files changed, 8925 insertions(+)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ec77918
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+*.gem
+doc
+.yardoc
+pdfdoc/index.html
+pdfdoc/rgfa*.pdf
+cheatsheet/*.aux
+cheatsheet/*_latexmk
+cheatsheet/*.fls
+cheatsheet/*.log
+cheatsheet/*.pdf
diff --git a/CHANGELOG b/CHANGELOG
new file mode 100644
index 0000000..04c0812
--- /dev/null
+++ b/CHANGELOG
@@ -0,0 +1,24 @@
+== 1.2 ==
+
+- merge RGFATools into the main RGFA gem
+
+== 1.1 ==
+
+- performance and code organization improvements
+- Line code rewritten:
+-- uses a Hash instead of an Array for field data
+-- support lazy parsing of some field types
+-- simpler code for subclasses
+-- define a datatype for required fields
+-- Field[Parser|Validator|Writer]
+- Optfield class replaced
+- use symbols instead of strings when appropriate
+
+== 1.0.1 ==
+
+- complete YARD documentation
+- remove redundant/unused code
+
+== 1.0 ==
+
+- initial release
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
new file mode 100644
index 0000000..f3c4de1
--- /dev/null
+++ b/CONTRIBUTORS
@@ -0,0 +1,4 @@
+The following contributors helped to develop RGFA. Please drop a note to
+gonnella at zbh.uni-hamburg.de if I left someone out or missed something.
+
+- Stefan Kurtz (advises)
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..17c66c3
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,19 @@
+All code of RGFA is released under the following ISC license.
+It is functionally equivalent to a two-term BSD copyright with
+language removed that is made unnecessary by the Berne convention.
+See http://openbsd.org/policy.html for more information on copyrights.
+
+Copyright (c) 2016 Giorgio Gonnella and CONTRIBUTORS
+Copyright (c) 2016 Center for Bioinformatics, University of Hamburg
+
+Permission to use, copy, modify, and distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..60e527d
--- /dev/null
+++ b/README.md
@@ -0,0 +1,43 @@
+The Graphical Fragment Assembly (GFA) is a proposed format which allows
+one to describe the product of sequence assembly.
+This gem implements the proposed specifications for the GFA format
+described under https://github.com/pmelsted/GFA-spec/blob/master/GFA-spec.md
+as closely as possible.
+
+The library allows one to create a RGFA object from a file in the GFA format
+or from scratch, to enumerate the graph elements (segments, links,
+containments, paths and header lines), to traverse the graph (by
+traversing all links outgoing from or incoming to a segment), to search for
+elements (e.g. which links connect two segments) and to manipulate the
+graph (e.g. to eliminate a link or a segment or to duplicate a segment
+distributing the read counts evenly on the copies).
+
+## Usage
+
+After installation of the gem (rake install), the library can be included
+in your own scripts with require "rgfa". Additional functionality, which
+requires custom tags and additional conventions, is included in a separate
+part of the code named "RGFATools" and can be accessed with require "rgfatools".
+
+## Documentation
+
+A cheatsheet is available as pdf under
+https://github.com/ggonnella/rgfa/blob/master/cheatsheet/rgfa-cheatsheet-1.2.pdf
+
+The full API documentation is available as pdf under
+https://github.com/ggonnella/rgfa/blob/master/pdfdoc/rgfa-api-1.2.pdf
+or in HTML format (http://www.rubydoc.info/github/ggonnella/rgfa/master/RGFA).
+
+The main class of the library is {RGFA}, which is a good starting point
+when reading the documentation.
+
+## References
+
+The manuscript describing the library has been presented at the
+German Conference on Bioinformatics 2016. Currently it is under review and
+available as a PeerJ preprint:
+
+Gonnella G, Kurtz S. (2016) RGFA: powerful and convenient handling of
+assembly graphs. PeerJ Preprints 4:e2381v1
+https://doi.org/10.7287/peerj.preprints.2381v1
+
diff --git a/Rakefile b/Rakefile
new file mode 100644
index 0000000..40ba406
--- /dev/null
+++ b/Rakefile
@@ -0,0 +1,53 @@
+require "rake/testtask"
+
+$rgfaversion="1.2"
+
+Rake::TestTask.new do |t|
+ t.libs << 'test'
+end
+
+desc "Run tests"
+task :default => :test
+
+desc "Build gem"
+task :build do
+ system("gem build rgfa.gemspec")
+end
+
+desc "Install gem"
+task :install => :build do
+ system("gem install rgfa")
+end
+
+desc "Rm files created by rake build"
+task :clean do
+ system("rm -f rgfa-*.gem")
+end
+
+# make documentation generation tasks
+# available only if yard gem is installed
+begin
+ require "yard"
+ YARD::Tags::Library.define_tag("Developer notes", :developer)
+ YARD::Rake::YardocTask.new do |t|
+ t.files = ['lib/**/*.rb']
+ t.stats_options = ['--list-undoc']
+ end
+rescue LoadError
+end
+
+desc "Create cheatsheet"
+task :cs do
+ system("latexmk cheatsheet/rgfa-cheatsheet-#$rgfaversion.tex "+
+ "-pdf -outdir=cheatsheet")
+end
+
+desc "Create a PDF documentation"
+task :pdf do
+ system("yard2.0 --one-file -o pdfdoc")
+ system("wkhtmltopdf cover pdfdoc/cover.html "+
+ "toc "+
+ "pdfdoc/index.html "+
+ "--user-style-sheet pdfdoc/print.css "+
+ "pdfdoc/rgfa-api-#$rgfaversion.pdf")
+end
diff --git a/bin/gfadiff.rb b/bin/gfadiff.rb
new file mode 100755
index 0000000..c6a4720
--- /dev/null
+++ b/bin/gfadiff.rb
@@ -0,0 +1,420 @@
+#!/usr/bin/env ruby
+
+require "rgfa"
+
+rt = []
+all_rt = %W[-h -s -l -c -p]
+all_rt.each do |rtopt|
+ rt << ARGV.delete(rtopt)
+end
+rt.compact!
+rt = all_rt if rt.empty?
+
+out_identical = ARGV.delete("-i")
+
+out_script = ARGV.delete("-script")
+
+if ARGV.size != 2
+ STDERR.puts "Compare two RGFA files"
+ STDERR.puts
+ STDERR.puts "Usage: #$0 [-h] [-s] [-l] [-c] [-p] [-i] [-script] <gfa1> <gfa2>"
+ STDERR.puts
+ STDERR.puts "If a combination of -h,-s,-l,-c and/or -p is specified, then"
+ STDERR.puts "only record of the specified type [h=headers, s=segments, "
+ STDERR.puts "l=links, c=containments, p=paths] are compared. "
+ STDERR.puts "(default: -h -s -l -c -p)"
+ STDERR.puts
+ STDERR.puts "Other options:"
+ STDERR.puts " -i: output msg if identical"
+ STDERR.puts " -script: create ruby script to transform gfa1 in gfa2"
+ exit 1
+end
+
+if out_script
+ puts "#!/usr/bin/env ruby"
+ puts
+ puts "#"
+ puts "# This script was automatically generated using gfadiff.rb"
+ puts "#"
+ puts "# Purpose: edit gfa1 into gfa2"
+ puts "#"
+ puts "# gfa1: #{ARGV[0]}"
+ puts "# gfa2: #{ARGV[1]}"
+ puts "#"
+ puts
+ puts "require \"rgfa\""
+ puts
+ puts "gfa = RGFA.from_file(\"#{ARGV[0]}\")"
+ puts
+end
+
+gfa1 = RGFA.new
+gfa1.turn_off_validations
+gfa1.read_file(ARGV[0])
+gfa2 = RGFA.new
+gfa2.turn_off_validations
+gfa2.read_file(ARGV[1])
+
+if rt.include?("-h")
+ h1 = gfa1.header
+ h2 = gfa2.header
+ if h1 == h2
+ if out_identical
+ puts "# Header values are identical"
+ elsif out_script
+ puts "# Headers"
+ puts "# ... are identical"
+ puts
+ end
+ else
+ if out_script
+ puts "# Headers"
+ puts
+ end
+ (h1.optional_fieldnames - h2.optional_fieldnames).each do |k|
+ if out_script
+ puts "gfa.header.delete_field(#{k.inspect})"
+ else
+ v = h1.get(k)
+ if v.kind_of?(RGFA::FieldArray)
+ t = v.datatype
+ v.each do |elem|
+ elem = elem.to_gfa_field(datatype: t)
+ puts "<\t[headers/exclusive/multivalue/#{k}]\t#{elem}"
+ end
+ else
+ v = h1.field_to_s(k, optfield: true)
+ puts "M\t[headers/exclusive]\t#{k.inspect}\t#{v}"
+ end
+ end
+ end
+ (h2.optional_fieldnames - h1.optional_fieldnames).each do |k|
+ v = h2.get(k)
+ if out_script
+ t = h2.get_datatype(k)
+ puts "gfa.header.set_datatype(#{k.inspect}, #{t.inspect})"
+ if v.kind_of?(RGFA::FieldArray)
+ t = v.datatype
+ v.each do |elem|
+ puts "gfa.header.add(#{k.inspect}, #{elem.inspect}, "+
+ "#{t.inspect})"
+ end
+ else
+ puts "gfa.header.#{k}=#{v.inspect}"
+ end
+ else
+ if v.kind_of?(RGFA::FieldArray)
+ t = v.datatype
+ v.each do |elem|
+ elem = elem.to_gfa_field(datatype: t)
+ puts ">\t[headers/exclusive/multivalue/#{k}]\t#{elem}"
+ end
+ else
+ v = h2.field_to_s(k, optfield: true)
+ puts ">\t[headers/exclusive]\t#{k.inspect}\t#{v}"
+ end
+ end
+ end
+ (h1.optional_fieldnames & h2.optional_fieldnames).each do |k|
+ v1 = h1.get(k)
+ v2 = h2.get(k)
+ v1a = v1.kind_of?(RGFA::FieldArray) ? v1.sort : [v1]
+ v2a = v2.kind_of?(RGFA::FieldArray) ? v2.sort : [v2]
+ t1 = v1.kind_of?(RGFA::FieldArray) ? v1.datatype : h1.get_datatype(k)
+ t2 = v2.kind_of?(RGFA::FieldArray) ? v2.datatype : h2.get_datatype(k)
+ m1 = v1.kind_of?(RGFA::FieldArray) ? "multivalue/" : ""
+ m2 = v2.kind_of?(RGFA::FieldArray) ? "multivalue/" : ""
+ if out_script
+ if t1 != t2 or v1a != v2a
+ puts "gfa.header.delete(#{k.inspect})"
+ v2a.each do |v2|
+ v2 = v2.to_gfa_field(datatype: t2)
+ puts "gfa.header.add(#{k.inspect}, #{v2.inspect}, "+
+ "#{t2.inspect})"
+ end
+ end
+ else
+ if t1 != t2
+ v1a.each do |v1|
+ v1 = v1.to_gfa_field(datatype: t1)
+ puts "<\t[headers/typediff/#{m1}#{k}#{}]\t#{v1}"
+ end
+ v2a.each do |v2|
+ v2 = v2.to_gfa_field(datatype: t2)
+ puts ">\t[headers/typediff/#{m2}#{k}]\t#{v2}"
+ end
+ else
+ (v1a-v2a).each do |v1|
+ v1 = v1.to_gfa_field(datatype: t1)
+ puts "<\t[headers/valuediff/#{m1}#{k}]\t#{v1}"
+ end
+ (v2a-v1a).each do |v2|
+ v2 = v2.to_gfa_field(datatype: t2)
+ puts ">\t[headers/valuediff/#{m2}#{k}]\t#{v2}"
+ end
+ end
+ end
+ end
+ if out_script
+ puts
+ end
+ end
+end
+
+def diff_segments_or_paths(gfa1,gfa2,rt,out_script,out_identical)
+ rts = rt + "s"
+ rtsU = rts[0].upcase + rts[1..-1]
+ s1names = gfa1.send("#{rt}_names").sort
+ s2names = gfa2.send("#{rt}_names").sort
+ difffound = false
+ if out_script
+ puts "# #{rtsU}"
+ puts
+ end
+ (s1names - s2names).each do |sn|
+ difffound = true
+ segstr = gfa1.send(rt,sn).to_s
+ if out_script
+ puts "gfa.rm(#{sn.inspect})"
+ else
+ puts "<\t[#{rts}/exclusive]\t#{segstr}"
+ end
+ end
+ (s2names - s1names).each do |sn|
+ difffound = true
+ segstr = gfa2.send(rt,sn).to_s
+ if out_script
+ puts "gfa << #{segstr.inspect}"
+ else
+ puts ">\t[#{rts}/exclusive]\t#{segstr}"
+ end
+ end
+ (s1names & s2names).each do |sn|
+ s1 = gfa1.send(rt,sn)
+ s2 = gfa2.send(rt,sn)
+ s1.required_fieldnames.each do |fn|
+ v1 = s1.field_to_s(fn)
+ v2 = s2.field_to_s(fn)
+ if v1 != v2
+ difffound = true
+ if out_script
+ puts "gfa.#{rt}(#{sn.inspect}).#{fn}=#{v2.inspect}"
+ else
+ puts "<\t[#{rts}/reqfields/valuediff/#{sn}]\t#{v1}"
+ puts ">\t[#{rts}/reqfields/valuediff/#{sn}]\t#{v2}"
+ end
+ end
+ end
+ s1f = s1.optional_fieldnames
+ s2f = s2.optional_fieldnames
+ (s1f - s2f).each do |fn|
+ difffound = true
+ if out_script
+ puts "gfa.#{rt}(#{sn.inspect}).delete_field(#{fn.inspect})"
+ else
+ v = s1.field_to_s(fn, optfield: true)
+ puts "<\t[#{rts}/optfields/exclusive/#{sn}]\t#{v}"
+ end
+ end
+ (s2f - s1f).each do |fn|
+ difffound = true
+ if out_script
+ v = s2.get(fn)
+ t = s2.get_datatype(fn)
+ puts "gfa.#{rt}(#{sn.inspect}).set_datatype(#{fn.inspect},#{t})"
+ puts "gfa.#{rt}(#{sn.inspect}).#{fn}=#{v.inspect}"
+ else
+ v = s2.field_to_s(fn, optfield: true)
+ puts ">\t[#{rts}/optfields/exclusive/#{sn}]\t#{v}"
+ end
+ end
+ (s1f & s2f).each do |fn|
+ v1 = s1.field_to_s(fn, optfield: true)
+ v2 = s2.field_to_s(fn, optfield: true)
+ if v1 != v2
+ difffound = true
+ if out_script
+ v = s2.get(fn)
+ t = s2.get_datatype(fn)
+ puts "gfa.#{rt}(#{sn.inspect}).set_datatype(#{fn.inspect},#{t})"
+ puts "gfa.#{rt}(#{sn.inspect}).#{fn}=#{v.inspect}"
+ else
+ puts "<\t[#{rts}/optfields/valuediff/#{sn}]\t#{v1}"
+ puts ">\t[#{rts}/optfields/valuediff/#{sn}]\t#{v2}"
+ end
+ end
+ end
+ end
+ if !difffound
+ if out_script
+ puts "# ... are identical"
+ elsif out_identical
+ puts "# #{rtsU} are identical"
+ end
+ end
+ puts if out_script
+end
+
+if rt.include?("-s")
+ diff_segments_or_paths(gfa1,gfa2, "segment",out_script,out_identical)
+end
+
+# TODO: diff of single optfields
+if rt.include?("-l")
+ difffound = false
+ s1names = gfa1.segment_names.sort
+ s2names = gfa2.segment_names.sort
+ if out_script
+ puts "# Links"
+ puts
+ end
+ difflinks1 = []
+ (s1names - s2names).each do |sn|
+ difffound = true
+ [:B, :E].each {|et| difflinks1 += gfa1.links_of([sn, et])}
+ end
+ difflinks1.uniq.each do |l|
+ if !out_script
+ puts "<\t[links/exclusive_segments]\t#{l.to_s}"
+ end
+ end
+ difflinks2 = []
+ (s2names - s1names).each do |sn|
+ difffound = true
+ [:B, :E].each {|et| difflinks2 += gfa2.links_of([sn, et])}
+ end
+ difflinks2.uniq.each do |l|
+ if out_script
+ puts "gfa << #{l.to_s.inspect}"
+ else
+ puts ">\t[links/exclusive_segments]\t#{l.to_s}"
+ end
+ end
+ difflinks1b = []
+ difflinks2b = []
+ (s1names & s2names).each do |sn|
+ [:B, :E].each do |et|
+ l1 = gfa1.links_of([sn, et])
+ l2 = gfa2.links_of([sn, et])
+ d1 = l1 - l2
+ d2 = l2 - l1
+ if !d1.empty?
+ difffound = true
+ difflinks1b += d1
+ end
+ if !d2.empty?
+ difffound = true
+ difflinks2b += d2
+ end
+ end
+ end
+ (difflinks1b-difflinks1).uniq.each do |l|
+ if out_script
+ puts "gfa.rm(gfa.link_from_to(#{l.from.to_sym.inspect}, "+
+ "#{l.from_orient.inspect}, "+
+ "#{l.to.to_sym.inspect}, "+
+ "#{l.to_orient.inspect}, "+
+ "#{l.overlap.to_s.inspect}.to_cigar))"
+ else
+ puts "<\t[links/different]\t#{l.to_s}"
+ end
+ end
+ (difflinks2b-difflinks2).uniq.each do |l|
+ if out_script
+ puts "gfa << #{l.to_s.inspect}"
+ else
+ puts ">\t[links/different]\t#{l.to_s}"
+ end
+ end
+ if !difffound
+ if out_script
+ puts "# ... are identical"
+ elsif out_identical
+ puts "# Links are identical"
+ end
+ end
+ puts if out_script
+end
+
+# TODO: this code is similar to -l; make generic and merge
+if rt.include?("-c")
+ difffound = false
+ s1names = gfa1.segment_names.sort
+ s2names = gfa2.segment_names.sort
+ cexcl1 = []
+ (s1names - s2names).each do |sn|
+ difffound = true
+ cexcl1 += gfa1.contained_in(sn)
+ cexcl1 += gfa1.containing(sn)
+ end
+ cexcl1.uniq.each do |c|
+ if !out_script
+ puts "<\t[contaiments/exclusive_segments]\t#{c.to_s}"
+ end
+ end
+ cexcl2 = []
+ (s2names - s1names).each do |sn|
+ difffound = true
+ cexcl2 += gfa2.contained_in(sn)
+ cexcl2 += gfa2.containing(sn)
+ end
+ cexcl2.uniq.each do |c|
+ if out_script
+ puts "gfa << #{c.to_s.inspect}"
+ else
+ puts ">\t[contaiments/exclusive_segments]\t#{c.to_s}"
+ end
+ end
+ cdiff1 = []
+ cdiff2 = []
+ (s1names & s2names).each do |sn|
+ c1 = gfa1.contained_in(sn)
+ c2 = gfa2.contained_in(sn)
+ c1 += gfa1.containing(sn)
+ c2 += gfa2.containing(sn)
+ d1 = c1 - c2
+ d2 = c2 - c1
+ if !d1.empty?
+ difffound = true
+ cdiff1 += d1
+ end
+ if !d2.empty?
+ difffound = true
+ cdiff2 += d2
+ end
+ end
+ (cdiff1-cexcl1).uniq.each do |l|
+ if out_script
+ # TODO: handle multiple containments for a segments pair
+ puts "gfa.rm(gfa.containment(#{l.from.to_sym.inspect}, "+
+ "#{l.to.to_sym.inspect}))"
+ else
+ puts "<\t[containments/different]\t#{l.to_s}"
+ end
+ end
+ (cdiff2-cexcl2).uniq.each do |l|
+ if out_script
+ puts "gfa << #{l.to_s.inspect}"
+ else
+ puts ">\t[containments/different]\t#{l.to_s}"
+ end
+ end
+ if !difffound
+ if out_script
+ puts "# ... are identical"
+ elsif out_identical
+ puts "# Containments are identical"
+ end
+ end
+ puts if out_script
+end
+
+if rt.include?("-p")
+ diff_segments_or_paths(gfa1,gfa2,"path",out_script,out_identical)
+end
+
+if out_script
+ puts
+ puts "# Output graph"
+ puts "puts gfa"
+end
diff --git a/bin/rgfa-findcrisprs.rb b/bin/rgfa-findcrisprs.rb
new file mode 100755
index 0000000..cc32cf0
--- /dev/null
+++ b/bin/rgfa-findcrisprs.rb
@@ -0,0 +1,208 @@
+#!/usr/bin/env ruby
+
+require "rgfatools"
+
+# crisprs have a structure ARU1RU..RUnRB where |U|~|R| in [24..50]
+
+$debugmode = false
+$spacersonly = false
+
+class RGFA
+
+ def find_crisprs(minrepeats=3,minlen=24,maxlen=50)
+ ls = {}
+ segment_names.each do |sn|
+ s = segment(sn)
+ s.cn = (s.coverage(unit_length: @default[:unit_length],
+ count_tag: @default[:count_tag])/2).round
+ end
+ output_segment_infos if $debugmode
+ maxvisits_global = {:B => {}, :E => {}}
+ segment_names.each do |sn|
+ s = segment(sn)
+ next if s.length < minlen or s.length > maxlen
+ next if s.cn < minrepeats
+ circles = {}
+ linear = {}
+ maxvisits = {}
+ [:B, :E].each do |rt|
+ maxvisits[rt] = maxvisits_global[rt].dup
+ maxvisits[rt][sn] ||= s.cn
+ circles[rt] = []
+ linear[rt] = []
+ segment_end = [s, rt].to_segment_end
+ links_of(segment_end).each do |l|
+ search_circle(segment_end.invert_end_type,
+ segment_end,
+ l,
+ maxvisits[rt],0,
+ minlen,
+ maxlen*2+s.length,
+ [segment_end],
+ circles[rt],
+ linear[rt])
+ end
+ if maxvisits[rt][sn.to_sym] > 0
+ multi = {:l => [], :c => []}
+ [[linear[rt],:l], [circles[rt],:c]].each do |paths, pt|
+ paths.each do |c|
+ min_mv = s.cn
+ upto = (pt == :l ? -1 : -2)
+ c[0..upto].each do |csn, et|
+ mv = maxvisits[rt][csn.to_sym]
+ if mv < min_mv
+ min_mv = mv
+ end
+ end
+ if min_mv > 0
+ min_mv.times { multi[pt] << c.dup }
+ c[0..upto].each do |csn, et|
+ maxvisits[rt][csn.to_sym] -= min_mv
+ end
+ end
+ end
+ end
+ circles[rt] += multi[:c]
+ linear[rt] += multi[:l]
+ end
+ end
+ n_paths = (circles[:E].size+circles[:B].size+
+ linear[:E].size+linear[:B].size)
+ if (circles[:E].size - circles[:B].size).abs > 1
+ next
+ end
+ if (linear[:E].size - linear[:B].size).abs > 0
+ next
+ end
+ if linear[:E].size != 1
+ next
+ end
+ merged_circles = []
+ circles[:E].each {|c|merged_circles << merge_crisprs_path(c,s,:E)}
+ before = merge_crisprs_path(linear[:B].first,s,:B)
+ after = merge_crisprs_path(linear[:E].first,s,:E)
+ next if merged_circles.size < minrepeats
+ maxvisits_global = maxvisits
+ instances = 1
+ possible_instances = 0
+ merged_circles.each do |seq|
+ if seq.length > s.length + minlen
+ possible_instances += 1
+ end
+ instances += 1
+ end
+ if $spacersonly
+ puts merged_circles.sort.map(&:upcase)
+ else
+ puts "CRISP signature found in segment #{s.name}"
+ puts
+ puts " Before: sequence = ...#{before[-50..-1]}"
+ puts
+ if possible_instances > 0
+ instances = "#{instances}..#{instances+possible_instances}"
+ end
+ puts " Repeat: instances = #{instances}; "+
+ "length = #{s.length};\t"+
+ "sequence = #{s.sequence}"
+ puts
+ puts " Spacers:"
+ asterisk = false
+ merged_circles.each_with_index do |seq, i|
+ if seq.length > s.length + minlen
+ str = "=#{s.length}+2*#{(seq.length.to_f - s.length)/2}"
+ asterisk = true
+ this_asterisk = true
+ else
+ str = ""
+ this_asterisk = false
+ end
+ puts " (#{i+1}#{this_asterisk ? "*" : ""})\t"+
+ "length = #{seq.length}#{str};\tsequence = #{seq}"
+ end
+ if asterisk
+ puts
+ puts " * = possibly containing inexact repeat instance"
+ end
+ puts
+ puts "After: sequence = #{after[0..49]}..."
+ end
+ end
+ end
+
+ private
+
+ def output_segment_infos
+ segment_names.each do |sn|
+ s = segment(sn)
+ puts "#{s.name}\t#{s.cn}\t"+
+ "#{neighbours([s.name,:B]).map{|nb|segment(nb.segment).cn}.inject(:+)}\t"+
+ "#{neighbours([s.name,:E]).map{|nb|segment(nb.segment).cn}.inject(:+)}\t"+
+ "#{links_of([s.name,:B]).size}\t"+
+ "#{links_of([s.name,:E]).size}\t"+
+ "#{s.KC}\t#{s.length}"
+ end
+ end
+
+ def merge_crisprs_path(segpath, repeat, repeat_end)
+ merged = create_merged_segment(segpath, merged_name: :short,
+ disable_tracking: true)[0]
+ sequence = merged.sequence[repeat.
+ sequence.length..-(1+repeat.sequence.length)]
+ sequence = sequence.rc if repeat_end == :B
+ return sequence
+ end
+
+ def search_circle(goal, from, l, maxvisits, dist, mindist,
+ maxdist, path, circles, linear)
+ dest = l.other_end(from)
+ dest.segment = segment(dest.segment)
+ maxvisits[dest.name] ||= dest.segment.cn
+ se = dest.invert_end_type
+ if dest == goal
+ return if dist < mindist
+ new_path = path.dup
+ new_path << se
+ new_path[0..-2].each {|x| maxvisits[x.name] -= 1}
+ circles << new_path
+ return
+ end
+ return if maxvisits[dest.name] == 0
+ return if path.any?{|x|x.name==dest.name}
+ new_path = path.dup
+ new_path << se
+ dist += dest.segment.length - l.overlap.first.len
+ if dist > maxdist
+ new_path = path.dup
+ new_path << se
+ new_path[0..-1].each {|x| maxvisits[x.name] -= 1}
+ linear << new_path
+ return
+ end
+ ls = links_of(se)
+ if ls.size == 0
+ new_path[0..-1].each {|x| maxvisits[x.name] -= 1}
+ linear << new_path
+ return
+ end
+ ls.each do |next_l|
+ next_dest = segment(next_l.other_end(se).segment)
+ maxvisits[next_dest.name] ||= next_dest.cn
+ next if maxvisits[next_dest.name] == 0
+ search_circle(goal,se,next_l,maxvisits,dist,mindist,maxdist,new_path,
+ circles,linear)
+ end
+ return
+ end
+
+end
+
+if (ARGV.size == 0)
+ STDERR.puts "Usage: #$0 <gfa>"
+ exit 1
+end
+gfa = RGFA.from_file(ARGV[0])
+gfa.set_default_count_tag(:KC)
+gfa.header.ks ||= gfa.segments[0].length + 1
+gfa.set_count_unit_length(gfa.header.ks-1)
+gfa.find_crisprs
+
diff --git a/bin/rgfa-mergelinear.rb b/bin/rgfa-mergelinear.rb
new file mode 100755
index 0000000..302f09f
--- /dev/null
+++ b/bin/rgfa-mergelinear.rb
@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+require "rgfatools"
+
+if ARGV.size != 1
+ STDERR.puts "Usage: #$0 <gfa>"
+ exit 1
+end
+
+gfa = RGFA.new
+gfa.enable_progress_logging(part: 0.01)
+gfa.turn_off_validations
+gfa.read_file(ARGV[0])
+gfa.merge_linear_paths(disable_tracking: true, merged_name: :short)
+puts gfa
diff --git a/bin/rgfa-simdebruijn.rb b/bin/rgfa-simdebruijn.rb
new file mode 100755
index 0000000..f6c2fd8
--- /dev/null
+++ b/bin/rgfa-simdebruijn.rb
@@ -0,0 +1,86 @@
+#!/usr/bin/env ruby
+require "rgfatools"
+require "set"
+
+def read_sequences(filename, logger)
+ file = File.new(filename)
+ sequences = []
+ linecount = `wc -l #{filename}`.strip.split(" ")[0].to_i
+ logger.progress_init(:read_file, "lines", linecount,
+ "Parse sequences from file with #{linecount} lines")
+ file.each do |line|
+ if line[0]==">"
+ sequences << ""
+ else
+ sequences.last << line.chomp
+ end
+ logger.progress_log(:read_file)
+ end
+ logger.progress_end(:read_file)
+ file.close
+ return sequences
+end
+
+if ARGV.size != 2
+ STDERR.puts "Usage: #$0 <k> <genome.fas>"
+ exit 1
+end
+
+k = Integer(ARGV[0])
+
+logger = RGFA::Logger.new()
+logger.enable_progress(part: 0.1)
+sequences = read_sequences(ARGV[1], logger)
+logger.log("Sequence lengths (nt): #{sequences.map(&:size)}")
+segments = {}
+links = Set.new
+kmercount = sequences.map{|seq|seq.length-k+1}.inject(:+)
+logger.progress_init(:generate_graph, "kmers", kmercount,
+ "Create graph from #{kmercount} kmers")
+i=1
+sequences.each do |seq|
+ 0.upto(seq.length-k) do |pos|
+ kmer = seq[pos..(pos+k-1)].downcase
+ prefix = kmer[0..k-2]
+ suffix = kmer[1..k-1]
+ link = "L"
+ [prefix, suffix].each do |km1mer|
+ orient = "+"
+ km1mer_rc = km1mer.rc
+ if km1mer > km1mer_rc
+ km1mer = km1mer_rc
+ orient = "-"
+ end
+ s = segments[km1mer.to_sym]
+ if s.nil?
+ s = [i,0]
+ segments[km1mer.to_sym] = s
+ i+=1;
+ end
+ s[1] += 1
+ link << "\t#{s[0]}\t#{orient}"
+ end
+ link << "\t#{k-2}M"
+ links << link
+ logger.progress_log(:generate_graph, segments_added: i,
+ links_added: links.size)
+ end
+end
+logger.progress_end(:generate_graph)
+segmentscount = i-1
+linkscount = links.size
+puts "H\tks:i:#{k}"
+logger.progress_init(:write_segments, "segments", segmentscount,
+ "Output #{segmentscount} segments")
+segments.each do |km1mer, data|
+ puts "S\t#{data[0]}\t#{km1mer}\tKC:i:#{data[1]}"
+ logger.progress_log(:write_segments)
+end
+logger.progress_end(:write_segments)
+logger.progress_init(:write_links, "links", linkscount,
+ "Output #{linkscount} links")
+links.each do |link|
+ puts link
+ logger.progress_log(:write_links)
+end
+logger.progress_end(:write_links)
diff --git a/cheatsheet/rgfa-cheatsheet-1.2.tex b/cheatsheet/rgfa-cheatsheet-1.2.tex
new file mode 100644
index 0000000..e17ab77
--- /dev/null
+++ b/cheatsheet/rgfa-cheatsheet-1.2.tex
@@ -0,0 +1,177 @@
+\documentclass[12pt]{scrartcl}
+
+\usepackage{comment}
+\usepackage{fancyhdr}
+\usepackage{lastpage}
+\pagestyle{fancy}
+
+\usepackage{array}
+\fancyhf{}
+\renewcommand{\headrulewidth}{0pt}
+\rhead{\bfseries RGFA/RGFATools v.1.2 Cheatsheet (\thepage/\pageref{LastPage})
+\hspace{1.1cm}}
+\lfoot{\tiny \ \ \ \ \ \ \ \ Copyright (c) 2016, Giorgio Gonnella, ZBH, University of
+Hamburg, Germany. This document is under CC-BY-SA license.}
+
+\newcounter{cstablecounter}
+\setlength{\footskip}{0in}
+\usepackage[top=1in, bottom=0.25in, left=0.1in, right=0.1in]{geometry}
+
+\newcommand{\cstablestart}{
+\begin{center}
+% \large
+% \textbf{RGFA: Cheatsheet
+% (\refstepcounter{cstablecounter}\arabic{cstablecounter}/2)
+% }\\
+%\end{center}
+%\vspace*{\fill}
+%\begin{table}[h]
+\centering
+}
+\setlength{\baselineskip}{0pt}
+\setlength{\textfloatsep}{0pt plus 0pt minus 0pt}
+\setlength{\intextsep}{0pt plus 0pt minus 0pt}
+\setlength{\floatsep}{0pt plus 0pt minus 0pt}
+\newcommand{\cstableend}{
+\end{center}
+ %\end{table}
+%\vspace*{\fill}
+%\newpage
+}
+
+\begin{document}
+
+\cstablestart
+
+\begin{tabular}{|l|>{\ttfamily}l|}
+ \hline
+ Create graph & RGFA.new \\
+ \ldots from GFA file & RGFA.from\_file("filename")\\
+ \ldots from string & string.to\_rgfa \\
+ \ldots from string array (e.g.) & \verb/["H\tVN:i:1.0",/ \\
+ & \hspace{2.3mm}\verb/"S\tA\t*\tLN:i:1200"].to_rgfa/ \\
+ \hline
+ Write GFA to file & gfa.to\_file(filename) \\
+ Write GFA to standard output & puts gfa \\
+ Create deep copy & gfa.clone \\
+ Validate after manual edits & gfa.validate! \\
+ Output statistics (normal/compact) & puts gfa.info; puts gfa.info(true) \\
+ \hline
+ Turn off validations & gfa.turn\_off\_validations \\
+ Segments first & gfa.require\_segments\_first\_order \\
+ Enable progress logging & gfa.enable\_progress\_logging \\
+ \hline
+ Name of all segments & gfa.segment\_names \\
+ Name of all paths & gfa.path\_names \\
+ All segments, links, paths, etc & gfa.segments; gfa.links; gfa.paths; \ldots \\
+ Iterate over segments, links, etc & gfa.each\_segment \verb/{|s|...}/ \\
+ \hline
+ Find segment & gfa.segment(segment\_name) \\
+ \ldots exception if does not exist & gfa.segment!(segment\_name) \\
+ \hline
+ Find path & gfa.path(path\_name) \textrm{(or: }path!\textrm{)} \\
+ All paths through segment & gfa.paths\_with(segment\_name) \\
+ \hline
+ Find link & gfa.link(\verb/[:S1,:E]/,\verb/[:S2,:B]/) \textrm{(or: }link!\textrm{)} \\
+ (or, if multiple may exist) & gfa.links\_between(\verb/[:S1,:E]/,\verb/[:S2,:B]/) \\
+ All links of segment end & gfa.links\_of(\verb/[:S1,:E]/) \\
+ (also segment instead of name) &
+ gfa.links\_of(\verb/[segment!(:S1),:E]/) \\
+ Target of all links & gfa.neighbours(\verb/[:S1,:E]/) \\
+ \hline
+ Find containment & gfa.containment(container,~contained)\\
+ & gfa.containment!(container,~contained)\\
+ (or, if multiple may exist) & gfa.containments\_between(c\_ner,~c\_ned)\\
+ All containments for a segment & gfa.containing(contained)\\
+ & gfa.contained\_in(container)\\
+ \hline
+ Add line (examples) & gfa << "H\verb/\t/VZ:i:1.0" \\
+ & gfa << "S\verb/\t/a\verb/\t/*\verb/\t/LN:i:1200" \\
+ Rename segment or path & gfa.rename("old", "new") \\
+ \hline
+ Segment coverage & s.coverage\\
+ Segment coverage (more accurate) & s.coverage(unit\_length:~avreadlen)\\
+ Segment K-mer coverage & s.coverage(count\_tag:~:KC)\\
+ Segment length & s.length\\
+ Other end of a link & link.other\_end([s1,:E])\\
+ Other end of other end of link &
+ link.other\_end([s1,:E])\\
+ & \hspace{2.8cm}.revert\_end\_type\\
+ \hline
+ Read req.field/tag value & segment.from; segment.LN \\
+ \ldots raise if tag not available & segment.LN! \\
+ \ldots tag string & segment.field\_to\_s(:LN) \\
+ \hline
+ Set/create custom tag (ab, Z type) & segment.ab = "value" \\
+ \ldots of i or B/i type & s.ab = 12; \ \ \ s.ab = [1,2,3]\\
+ \ldots of f or B/f type & s.ab = 12.0; \ s.ab = [1.2,2.3,3.0] \\
+ \ldots of J type (hash/array) & s.ab = \{"a" => 12\}; s.ab = ["a","b",1] \\
+ \hline
+
+\end{tabular}
+
+\cstableend
+
+\cstablestart
+
+\begin{tabular}{|l|>{\ttfamily}l|}
+ \hline
+ Delete segment (and its links, etc) & gfa.rm("a") \\
+ Delete path & gfa.rm("path1") \\
+ Delete link/containment & gfa.rm(gfa.link(\ldots)) \\
+ Delete all headers & gfa.rm(:headers) \\
+ Delete sequences (set all to \texttt{*}) & gfa.rm(:sequences) \\
+ \hline
+ \textit{(rm with a method)} &\\
+ Delete links of segment end & gfa.rm(\verb/:links_of,[:S1,:E]/) \\
+ Delete link targets & gfa.rm(\verb/:neighbours,[:S1,:E]/) \\
+ Delete paths of segment & gfa.rm(\verb/:paths_with,:S1/) \\
+ Delete segments contained in s & gfa.rm(\verb/:contained_in,:s/) \\
+ Delete s1-E links except to s2-B & gfa.delete\_other\_links(\verb/[s1,:E],[s2,:B]/)\\
+ \hline
+ Content of headers field & gfa.header.xx \\
+ Replace header field content & gfa.set\_header\_field(:xx, 12, \\
+ & \hspace{4.3cm})\\
+ Append to header field & gfa.set\_header\_field(:xx, 12,\\
+ & \hspace{4.3cm}, existing: :add)\\
+ \hline
+ Sum of read counts & \verb/gfa.segments.map(&:RC).inject(:+)/ \\
+ Highest coverage & \verb/gfa.segments.map(&:coverage).max/ \\
+ Delete low coverage segments & \verb/gfa.rm(gfa.segments.select {|s|/ \\
+ & \hspace{2.7cm}\verb/s.coverage < mincov })/ \\
+ Delete isolated segments & \verb/gfa.rm(gfa.segments.select {|s|/ \\
+ & \hspace{1cm}\verb/gfa.connectivity(s) == [0,0] })/ \\
+ \hline
+ Multiply segment & gfa.multiply("A", 4) \\
+ Detect linear paths & gfa.linear\_paths \\
+ Detect and merge linear paths & gfa.merge\_linear\_paths \\
+ Compute connected components & gfa.connected\_components \\
+ Component of a segment & gfa.segment\_connected\_component(s) \\
+ Split components & gfa.split\_connected\_components \\
+ Number of dead ends & gfa.n\_dead\_ends \\
+ \hline
+ \textit{(with RGFATools only)} & \\
+ Multiply segment, distribute links & gfa.multiply("A", 4) \\
+ Compute copy numbers & gfa.compute\_copy\_numbers \\
+ Apply copy numbers & gfa.apply\_copy\_numbers \\
+ Orient invertible segments & gfa.randomly\_orient\_invertibles \\
+ Enforce mandatory links & gfa.enforce\_mandatory\_links \\
+ Remove p-bubbles & gfa.remove\_p\_bubbles \\
+ Remove small components & gfa.remove\_small\_components(minlen) \\
+ \hline
+ \textit{(Command line tools)} & \\
+ Compare two GFA files & gfadiff.rb 1.gfa 2.gfa \\
+ \ldots only segments and links & gfadiff.rb -s -l 1.gfa 2.gfa \\
+ \ldots output as ruby script & gfadiff.rb -script 1.gfa 2.gfa \\
+ Merge linear paths in graph & rgfa-mergelinear.rb 2.gfa > 3.gfa \\
+ \hline
+ \textit{(Experimental command line tools)} & \\
+ Simulate de Bruijn graph & rgfa-simdebruijn.rb 27 gnm.fas > 1.gfa \\
+ \ldots and find CRISPRs candidates & rgfa-findcrisprs.rb 1.gfa \\
+ \hline
+\end{tabular}
+
+\cstableend
+
+\end{document}
+
diff --git a/lib/rgfa.rb b/lib/rgfa.rb
new file mode 100644
index 0000000..bf87ccf
--- /dev/null
+++ b/lib/rgfa.rb
@@ -0,0 +1,376 @@
+# (c) 2016, Giorgio Gonnella, ZBH, Uni-Hamburg <gonnella at zbh.uni-hamburg.de>
+
+# Main class of the RGFA library.
+#
+# RGFA provides a representation of a GFA graph.
+# It supports creating a graph from scratch, input and output from/to file
+# or strings, as well as several operations on the graph.
+# The examples below show how to create a RGFA object from scratch or
+# from a GFA file, write the RGFA to file, output the string representation or
+# a statistics report, and control the validation level.
+#
+# == Interacting with the graph
+#
+# - {RGFA::Lines}: module with methods for finding, editing, iterating over,
+# removing lines belonging to a RGFA instance. Specialized modules exist
+# for each kind of line:
+# - {RGFA::Headers}: accessing and creating header information is done
+# using a single header line object ({headers RGFA#header})
+# - {RGFA::Segments}
+# - {RGFA::Links}
+# - {RGFA::Containments}
+# - {RGFA::Paths}
+#
+# - {RGFA::Line}: most interactions with the GFA involve interacting with
+# its records, i.e. instances of a subclass of this class. Subclasses:
+# - {RGFA::Line::Header}
+# - {RGFA::Line::Segment}
+# - {RGFA::Line::Link}
+# - {RGFA::Line::Containment}
+# - {RGFA::Line::Path}
+#
+# - Further modules contain methods useful for interacting with the graph
+# - {RGFA::Connectivity} analysis of the connectivity of the graph
+# - {RGFA::LinearPaths} finding and merging of linear paths
+# - {RGFA::Multiplication} separation of the implicit instances of a repeat
+#
+# - Additional functionality is provided by {RGFATools}
+#
+# @example Creating an empty RGFA object
+# gfa = RGFA.new
+#
+# @example Parsing and writing GFA format
+# gfa = RGFA.from_file(filename) # parse GFA file
+# gfa.to_file(filename) # write to GFA file
+# puts gfa # show GFA representation of RGFA object
+#
+# @example Basic statistics report
+# puts gfa.info # print report
+# puts gfa.info(short = true) # compact format, in one line
+#
+# @example Validation
+# gfa = RGFA.from_file(filename, validate: 1) # default level is 2
+# gfa.validate = 3 # change validation level
+# gfa.turn_off_validations # equivalent to gfa.validate = 0
+# gfa.validate! # run post-validations (e.g. check segment names in links)
+#
+class RGFA
+end
+
+require_relative "./rgfa/byte_array.rb"
+require_relative "./rgfa/cigar.rb"
+require_relative "./rgfa/connectivity.rb"
+require_relative "./rgfa/containments.rb"
+require_relative "./rgfa/field_array.rb"
+require_relative "./rgfa/field_parser.rb"
+require_relative "./rgfa/field_validator.rb"
+require_relative "./rgfa/field_writer.rb"
+require_relative "./rgfa/multiplication.rb"
+require_relative "./rgfa/headers.rb"
+require_relative "./rgfa/line.rb"
+require_relative "./rgfa/linear_paths.rb"
+require_relative "./rgfa/lines.rb"
+require_relative "./rgfa/links.rb"
+require_relative "./rgfa/logger.rb"
+require_relative "./rgfa/numeric_array.rb"
+require_relative "./rgfa/rgl.rb"
+require_relative "./rgfa/segment_ends_path.rb"
+require_relative "./rgfa/segment_info.rb"
+require_relative "./rgfa/segments.rb"
+require_relative "./rgfa/paths.rb"
+require_relative "./rgfa/sequence.rb"
+
+class RGFA
+
+ include RGFA::Lines
+ include RGFA::Headers
+ include RGFA::Segments
+ include RGFA::Links
+ include RGFA::Containments
+ include RGFA::Paths
+ include RGFA::LinearPaths
+ include RGFA::Connectivity
+ include RGFA::Multiplication
+ include RGFA::LoggerSupport
+ include RGFA::RGL
+
+ attr_accessor :validate
+
+ # @!macro validate
+ # @param validate [Integer] (<i>defaults to: +2+</i>)
+ # the validation level; see "Validation level" under
+ # {RGFA::Line#initialize}.
+ def initialize(validate: 2)
+ # validation level; also handed to the header line created below
+ @validate = validate
+ # must run after @validate is set, as init_headers reads it
+ init_headers
+ # segments and paths are hashes (their keys are the line names);
+ # links and containments are plain lists
+ @segments = {}
+ @links = []
+ @containments = []
+ @paths = {}
+ # switched on by #require_segments_first_order
+ @segments_first_order = false
+ # progress logging during long operations is off by default
+ @progress = false
+ @default = {:count_tag => :RC, :unit_length => 1}
+ @extensions_enabled = false
+ end
+
+ # Require that the links, containments and paths referring
+ # to a segment are added after the segment. Default: do not
+ # require any particular ordering.
+ #
+ # @return [void]
+ def require_segments_first_order
+ @segments_first_order = true
+ end
+
+ # Set the validation level to 0.
+ # See "Validation level" under {RGFA::Line#initialize}.
+ # @return [void]
+ def turn_off_validations
+ @validate = 0
+ end
+
+ # List all names of segments in the graph
+ # @return [Array<Symbol>]
+ def segment_names
+ @segments.keys.compact
+ end
+
+ # List all names of path lines in the graph
+ # @return [Array<Symbol>]
+ def path_names
+ @paths.keys.compact
+ end
+
+ # Post-validation of the RGFA
+ # @return [void]
+ # @raise if validation fails
+ def validate!
+ validate_segment_references!
+ validate_path_links!
+ return nil
+ end
+
+ # Creates a string representation of RGFA conforming to the current
+ # specifications
+ # @return [String]
+ def to_s
+ s = ""
+ each_line {|line| s << line.to_s; s << "\n"}
+ return s
+ end
+
+ # Return the gfa itself
+ # @return [self]
+ def to_rgfa
+ self
+ end
+
+ # Create a copy of the RGFA instance.
+ # @return [RGFA]
+ def clone
+ cpy = to_s.to_rgfa(validate: 0)
+ cpy.validate = @validate
+ cpy.enable_progress_logging if @progress
+ cpy.require_segments_first_order if @segments_first_order
+ return cpy
+ end
+
+ # Populates a RGFA instance reading from file with specified +filename+
+ # @param [String] filename
+ # @raise if file cannot be opened for reading
+ # @return [self]
+ def read_file(filename)
+ if @progress
+ linecount = `wc -l #{filename}`.strip.split(" ")[0].to_i
+ progress_log_init(:read_file, "lines", linecount,
+ "Parse file with #{linecount} lines")
+ end
+ File.foreach(filename) do |line|
+ self << line.chomp
+ progress_log(:read_file) if @progress
+ end
+ progress_log_end(:read_file) if @progress
+ validate! if @validate >= 1
+ self
+ end
+
+ # Creates a RGFA instance parsing the file with specified +filename+
+ # @param [String] filename
+ # @raise if file cannot be opened for reading
+ # @!macro validate
+ # @return [RGFA]
+ def self.from_file(filename, validate: 2)
+ gfa = RGFA.new(validate: validate)
+ gfa.read_file(filename)
+ return gfa
+ end
+
+ # Write RGFA to file with specified +filename+;
+ # overwrites it if it exists
+ # @param [String] filename
+ # @raise if file cannot be opened for writing
+ # @return [void]
+ def to_file(filename)
+ File.open(filename, "w") {|f| each_line {|l| f.puts l}}
+ end
+
+ # Output basic statistics about the graph's sequence and topology
+ # information.
+ #
+ # @param [boolean] short compact output as a single text line
+ #
+ # Compact output has the following keys:
+ # - +ns+: number of segments
+ # - +nl+: number of links
+ # - +cc+: number of connected components
+ # - +de+: number of dead ends
+ # - +tl+: total length of segment sequences
+ # - +50+: N50 segment sequence length
+ #
+ # Normal output outputs a table with the same information, plus some
+ # additional one: the length of the largest
+ # component, as well as the shortest and largest and 1st/2nd/3rd quartiles
+ # of segment sequence length.
+ #
+ # @return [String] sequence and topology information collected from the graph.
+ #
+ def info(short = false)
+ q, n50, tlen = lenstats
+ nde = n_dead_ends()
+ pde = "%.2f%%" % ((nde.to_f*100) / (segments.size*2))
+ cc = connected_components()
+ cc.map!{|c|c.map{|sn|segment!(sn).length!}.inject(:+)}
+ if short
+ return "ns=#{segments.size}\t"+
+ "nl=#{links.size}\t"+
+ "cc=#{cc.size}\t"+
+ "de=#{nde}\t"+
+ "tl=#{tlen}\t"+
+ "50=#{n50}"
+ end
+ retval = []
+ retval << "Segment count: #{segments.size}"
+ retval << "Links count: #{links.size}"
+ retval << "Total length (bp): #{tlen}"
+ retval << "Dead ends: #{nde}"
+ retval << "Percentage dead ends: #{pde}"
+ retval << "Connected components: #{cc.size}"
+ retval << "Largest component (bp): #{cc.last}"
+ retval << "N50 (bp): #{n50}"
+ retval << "Shortest segment (bp): #{q[0]}"
+ retval << "Lower quartile segment (bp): #{q[1]}"
+ retval << "Median segment (bp): #{q[2]}"
+ retval << "Upper quartile segment (bp): #{q[3]}"
+ retval << "Longest segment (bp): #{q[4]}"
+ return retval
+ end
+
+ # Counts the dead ends.
+ #
+ # Dead ends are here defined as segment ends without connections.
+ #
+ # @return [Integer] number of dead ends in the graph
+ #
+ def n_dead_ends
+ segments.inject(0) do |n,s|
+ [:E, :B].each {|e| n+= 1 if links_of([s.name, e]).empty?}
+ n
+ end
+ end
+
+ # Compare two RGFA instances.
+ # @return [Boolean] are the lines of the two instances equivalent?
+ def ==(other)
+ segments == other.segments and
+ links == other.links and
+ containments == other.containments and
+ headers == other.headers and
+ paths == other.paths
+ end
+
+ private
+
+ def lenstats
+ sln = segments.map(&:length!).sort
+ n = sln.size
+ tlen = sln.inject(:+)
+ n50 = nil
+ sum = 0
+ sln.reverse.each do |l|
+ sum += l
+ if sum >= tlen/2
+ n50 = l
+ break
+ end
+ end
+ q = [sln[0], sln[(n/4)-1], sln[(n/2)-1], sln[((n*3)/4)-1], sln[-1]]
+ return q, n50, tlen
+ end
+
+ # Checks that L, C and P refer to existing S.
+ # @return [void]
+ # @raise [RGFA::LineMissingError] if validation fails
+ def validate_segment_references!
+ @segments.values.each do |s|
+ if s.virtual?
+ raise RGFA::LineMissingError, "Segment #{s.name} does not exist\n"+
+ "References to #{s.name} were found in the following lines:\n"+
+ s.all_references.map(&:to_s).join("\n")
+ end
+ end
+ return nil
+ end
+
+ # Checks that P are supported by links.
+ # @return [void]
+ # @raise if validation fails
+ def validate_path_links!
+ @paths.values.each do |pt|
+ pt.links.each do |l, dir|
+ if l.virtual?
+ raise RGFA::LineMissingError, "Link: #{l.to_s}\n"+
+ "does not exist, but is required by the paths:\n"+
+ l.paths.map{|pt2, dir2|pt2.to_s}.join("\n")
+ end
+ end
+ end
+ return nil
+ end
+
+ def init_headers
+ @headers = RGFA::Line::Header.new([], validate: @validate)
+ end
+
+end
+
+# Ruby core String class, with additional methods.
+class String
+
+ # Converts a +String+ into a +RGFA+ instance. Each line of the string is added
+ # separately to the gfa.
+ # @return [RGFA]
+ # @!macro validate
+ def to_rgfa(validate: 2)
+ gfa = RGFA.new(validate: validate)
+ split("\n").each {|line| gfa << line}
+ gfa.validate! if validate >= 1
+ return gfa
+ end
+
+end
+
+# Ruby core Array class, with additional methods.
+class Array
+
+ # Converts an +Array+ of strings or RGFA::Line instances
+ # into a +RGFA+ instance.
+ # @return [RGFA]
+ # @!macro validate
+ def to_rgfa(validate: 2)
+ gfa = RGFA.new(validate: validate)
+ each {|line| gfa << line}
+ gfa.validate! if validate >= 1
+ return gfa
+ end
+
+end
diff --git a/lib/rgfa/byte_array.rb b/lib/rgfa/byte_array.rb
new file mode 100644
index 0000000..8260927
--- /dev/null
+++ b/lib/rgfa/byte_array.rb
@@ -0,0 +1,74 @@
+require_relative "error.rb"
+
+#
+# Array of integers in the range 0..255;
+# representation of the data contained in an H field
+#
+class RGFA::ByteArray < Array
+
+ # Validates the byte array content
+ # @raise [RGFA::ByteArray::ValueError] if any value is not a
+ # positive integer <= 255
+ # @return [void]
+ def validate!
+ each do |x|
+ unless x.kind_of?(Integer) and (0..255).include?(x)
+ raise RGFA::ByteArray::ValueError,
+ "Value incompatible with byte array: #{x.inspect}\n"+
+ "in array: #{self.inspect}"
+ end
+ end
+ self.trust
+ return nil
+ end
+
+ # Returns self
+ # @return [RGFA::ByteArray] self
+ def to_byte_array
+ self
+ end
+
+ # GFA datatype H representation of the byte array
+ # @raise [RGFA::ByteArray::ValueError] if the
+ # array is not a valid byte array
+ # @return [String]
+ def to_s
+ validate!
+ map do |elem|
+ str = elem.to_s(16).upcase
+ elem < 16 ? "0#{str}" : str
+ end.join
+ end
+
+end
+
+# Exception raised if any value is not a positive integer <= 255
+class RGFA::ByteArray::ValueError < RGFA::Error; end
+
+# Exception raised if string is not a valid representation of byte array
+class RGFA::ByteArray::FormatError < RGFA::Error; end
+
+# Method to create a RGFA::ByteArray from an Array
+class Array
+ # Create a RGFA::ByteArray from an Array instance
+ # @return [RGFA::ByteArray] the byte array
+ def to_byte_array
+ RGFA::ByteArray.new(self)
+ end
+end
+
+# Method to parse the string representation of a RGFA::ByteArray
+class String
+ # Convert a GFA string representation of a byte array to a byte array
+ # @return [RGFA::ByteArray] the byte array
+ # @raise [RGFA::ByteArray::FormatError] if the string size is not > 0
+ # and even
+ def to_byte_array
+ if (size < 2) or (size % 2 == 1)
+ raise RGFA::ByteArray::FormatError,
+ "Invalid byte array string #{self}; "+
+ "each element must be represented by two letters [0-9A-F]"
+ end
+ scan(/..?/).map {|x|Integer(x,16)}.to_byte_array
+ end
+end
diff --git a/lib/rgfa/cigar.rb b/lib/rgfa/cigar.rb
new file mode 100644
index 0000000..4d07b7a
--- /dev/null
+++ b/lib/rgfa/cigar.rb
@@ -0,0 +1,156 @@
+require_relative "error.rb"
+
+# Array of {RGFA::CIGAR::Operation CIGAR operations}.
+# Represents the contents of a CIGAR string.
+class RGFA::CIGAR < Array
+
+ # Compute the CIGAR for the segments in reverse direction.
+ #
+ # @example Reversing a CIGAR
+ #
+ # RGFA::CIGAR.from_string("2M1D3M").reverse.to_s
+ # # => "3M1I2M"
+ #
+ # # S1 + S2 + 2M1D3M
+ # #
+ # # S1+ ACGACTGTGA
+ # # S2+ CT-TGACGG
+ # #
+ # # S2- CCGTCA-AG
+ # # S1- TCACAGTCGT
+ # #
+ # # S2 - S1 - 3M1I2M
+ #
+ # @return [RGFA::CIGAR] (empty if CIGAR string is *)
+ def reverse
+ super.map do |op|
+ if op.code == :I
+ op.code = :D
+ elsif op.code == :D
+ op.code = :I
+ end
+ op
+ end
+ end
+
+ # Parse a CIGAR string into an array of CIGAR operations.
+ #
+ # Each operation is represented by a {RGFA::CIGAR::Operation},
+ # i.e. a tuple of operation length and operation
+ # symbol (one of MIDNSHPX=).
+ #
+ # @return [RGFA::CIGAR] (empty if string is *)
+ # @raise [RGFA::CIGAR::ValueError] if the string is not a valid CIGAR string
+ def self.from_string(str)
+ a = RGFA::CIGAR.new
+ if str != "*"
+ raise RGFA::CIGAR::ValueError if str !~ /^([0-9]+[MIDNSHPX=])+$/
+ str.scan(/[0-9]+[MIDNSHPX=]/).each do |op|
+ len = op[0..-2].to_i
+ code = op[-1..-1].to_sym
+ a << RGFA::CIGAR::Operation.new(len, code)
+ end
+ end
+ return a
+ end
+
+ # String representation of the CIGAR
+ # @return [String] CIGAR string
+ def to_s
+ if empty?
+ return "*"
+ else
+ map(&:to_s).join
+ end
+ end
+
+ # Validate the instance
+ # @raise if any component of the CIGAR array is invalid.
+ # @return [void]
+ def validate!
+ any? do |op|
+ op.to_cigar_operation.validate!
+ end
+ end
+
+ # @return [RGFA::CIGAR] self
+ def to_cigar
+ self
+ end
+
+ # Create a copy
+ # @return [RGFA::CIGAR]
+ def clone
+ map{|x|x.clone}
+ end
+
+end
+
+# Exception raised by invalid CIGAR string content
+class RGFA::CIGAR::ValueError < RGFA::Error; end
+
+# An operation in a CIGAR string
+class RGFA::CIGAR::Operation
+ attr_accessor :len
+ attr_accessor :code
+
+ CODE = [:M, :I, :D, :N, :S, :H, :P, :X, :"="]
+
+ # @param len [Integer] length of the operation
+ # @param code [RGFA::CIGAR::Operation::CODE] code of the operation
+ def initialize(len, code)
+ @len = len
+ @code = code
+ end
+
+ # The string representation of the operation
+ # @return [String]
+ def to_s
+ "#{len}#{code}"
+ end
+
+ # Compare two operations
+ # @return [Boolean]
+ def ==(other)
+ other.len == len and other.code == code
+ end
+
+ # Validate the operation
+ # @return [void]
+ # @raise [RGFA::CIGAR::ValueError] if the code is invalid or the length is not
+ # an integer larger than zero
+ def validate!
+ if Integer(len) <= 0 or
+ !RGFA::CIGAR::Operation::CODE.include?(code)
+ raise RGFA::CIGAR::ValueError
+ end
+ end
+
+ # @return [RGFA::CIGAR::Operation] self
+ def to_cigar_operation
+ self
+ end
+end
+
+class Array
+ # Create a {RGFA::CIGAR} instance from the content of the array.
+ # @return [RGFA::CIGAR]
+ def to_cigar
+ RGFA::CIGAR.new(self)
+ end
+ # Create a {RGFA::CIGAR::Operation} instance from the content of the array.
+ # @return [RGFA::CIGAR::Operation]
+ def to_cigar_operation
+ RGFA::CIGAR::Operation.new(Integer(self[0]), self[1].to_sym)
+ end
+end
+
+class String
+ # Parse CIGAR string and return an array of CIGAR operations
+ # @return [RGFA::CIGAR] CIGAR operations (empty if string is "*")
+ # @raise [RGFA::CIGAR::ValueError] if the string is not a valid CIGAR string
+ def to_cigar
+ RGFA::CIGAR.from_string(self)
+ end
+end
+
diff --git a/lib/rgfa/connectivity.rb b/lib/rgfa/connectivity.rb
new file mode 100644
index 0000000..55a0368
--- /dev/null
+++ b/lib/rgfa/connectivity.rb
@@ -0,0 +1,131 @@
+#
+# Methods which analyse the connectivity of the graph.
+#
+module RGFA::Connectivity
+
+ require "set"
+
+ # Computes the connectivity of a segment from its number of links.
+ #
+ # @param segment [String|RGFA::Line::Segment] segment name or instance
+ #
+ # @return [Array<conn_symbol,conn_symbol>]
+ # conn. symbols respectively of the :B and :E ends of +segment+.
+ #
+ # <b>Connectivity symbol:</b> (+conn_symbol+)
+ # - Let _n_ be the number of links to an end (+:B+ or +:E+) of a segment.
+ # Then the connectivity symbol is +:M+ if <i>n > 1</i>, otherwise _n_.
+ #
+ def connectivity(segment)
+ connectivity_symbols(links_of([segment, :B]).size,
+ links_of([segment, :E]).size)
+ end
+
+ # Does the removal of the link alone divide a component
+ # of the graph into two?
+ # @return [Boolean]
+ # @param link [RGFA::Line::Link] a link
+ def cut_link?(link)
+ return false if link.circular?
+ return true if links_of(link.from_end.invert_end_type).size == 0
+ return true if links_of(link.to_end.invert_end_type).size == 0
+ c = {}
+ [:from, :to].each do |et|
+ c[et] = Set.new
+ visited = Set.new
+ segend = link.send(:"#{et}_end")
+ visited << segend.name
+ visited << link.other_end(segend).name
+ traverse_component(segend, c[et], visited)
+ end
+ return c[:from] != c[:to]
+ end
+
+ # Does the removal of the segment and its links divide a
+ # component of the graph into two?
+ # @param segment [String, RGFA::Line::Segment] a segment name or instance
+ # @return [Boolean]
+ def cut_segment?(segment)
+ segment_name = segment.kind_of?(RGFA::Line) ? segment.name : segment
+ cn = connectivity(segment_name)
+ return false if [[0,0],[0,1],[1,0]].include?(cn)
+ start_points = []
+ [:B, :E].each do |et|
+ start_points += links_of([segment_name, et]).map do |l|
+ l.other_end([segment_name, et]).invert_end_type
+ end
+ end
+ cc = []
+ start_points.uniq.each do |start_point|
+ cc << Set.new
+ visited = Set.new
+ visited << segment_name
+ traverse_component(start_point, cc.last, visited)
+ end
+ return cc.any?{|c|c != cc[0]}
+ end
+
+ # Find the connected component of the graph in which a segment is included
+ # @return [Array<String>]
+ # array of segment names
+ # @param segment [String, RGFA::Line::Segment] a segment name or instance
+ # @param visited [Set<String>] a set of segments to ignore during graph
+ # traversal; all segments in the found component will be added to it
+ def segment_connected_component(segment, visited = Set.new)
+ # accept either a line instance or a segment name
+ segment_name = segment.kind_of?(RGFA::Line) ? segment.name : segment
+ visited << segment_name
+ # the component always contains the segment itself, as first element
+ c = [segment_name]
+ # follow the links of both ends of the segment; the names of the segments
+ # reached are appended to c and added to visited
+ traverse_component([segment_name, :B], c, visited)
+ traverse_component([segment_name, :E], c, visited)
+ return c
+ end
+
+ # Find the connected components of the graph
+ # @return [Array<Array<String>>]
+ # array of components, each an array of segment names
+ def connected_components
+ components = []
+ visited = Set.new
+ segment_names.each do |sn|
+ next if visited.include?(sn)
+ components << segment_connected_component(sn, visited)
+ end
+ return components
+ end
+
+ # Split connected components of the graph into single-component RGFAs
+ # @return [Array<RGFA>]
+ def split_connected_components
+ retval = []
+ ccs = connected_components
+ ccs.each do |cc|
+ gfa2 = self.clone
+ gfa2.rm(gfa2.segment_names - cc)
+ retval << gfa2
+ end
+ return retval
+ end
+
+ private
+
+ def traverse_component(segment_end, c, visited)
+ links_of(segment_end).each do |l|
+ oe = l.other_end(segment_end)
+ sn = oe.name
+ next if visited.include?(sn)
+ visited << sn
+ c << sn
+ traverse_component([sn, :B], c, visited)
+ traverse_component([sn, :E], c, visited)
+ end
+ end
+
+ def connectivity_symbols(n,m)
+ [connectivity_symbol(n), connectivity_symbol(m)]
+ end
+
+ def connectivity_symbol(n)
+ n > 1 ? :M : n
+ end
+
+end
diff --git a/lib/rgfa/containments.rb b/lib/rgfa/containments.rb
new file mode 100644
index 0000000..60f2430
--- /dev/null
+++ b/lib/rgfa/containments.rb
@@ -0,0 +1,97 @@
+require_relative "error"
+
+#
+# Methods for the RGFA class, which allow to handle containments in the graph.
+#
+module RGFA::Containments
+
+ def add_containment(gfa_line)
+ gfa_line = gfa_line.to_rgfa_line(validate: @validate)
+ @containments << gfa_line
+ [:from, :to].each do |dir|
+ segment_name = gfa_line.send(dir)
+ orient = gfa_line.send(:"#{dir}_orient")
+ if !@segments.has_key?(segment_name)
+ raise RGFA::LineMissingError if @segments_first_order
+ @segments[segment_name] =
+ RGFA::Line::Segment.new({:name => segment_name},
+ virtual: true)
+ end
+ s = @segments[segment_name]
+ s.containments[dir][orient] << gfa_line
+ gfa_line.send(:"#{dir}=", s)
+ end
+ end
+ protected :add_containment
+
+ # Delete a containment
+ #
+ # The line is removed from the list of containments of the graph and
+ # from the containment lists of its +from+ and +to+ segments.
+ #
+ # @param c [RGFA::Line::Containment] containment instance
+ # @return the value of the last +delete+ call, not +self+
+ #   (NOTE(review): this was documented as returning self; confirm what
+ #   callers expect before relying on the return value)
+ def delete_containment(c)
+ @containments.delete(c)
+ segment(c.from).containments[:from][c.from_orient].delete(c)
+ segment(c.to).containments[:to][c.to_orient].delete(c)
+ end
+
+ # All containments in the graph
+ # @return [Array<RGFA::Line::Containment>]
+ def containments
+ @containments
+ end
+
+ # Find containment lines whose +from+ segment name is +segment_name+
+ # @!macro segment_or_name
+ # @return [Array<RGFA::Line::Containment>]
+ def contained_in(s)
+ s = segment!(s)
+ s.containments[:from][:+] + s.containments[:from][:-]
+ end
+
+ # Find containment lines whose +to+ segment name is +segment_name+
+ # @return [Array<RGFA::Line::Containment>]
+ # @!macro segment_or_name
+ def containing(s)
+ s = segment!(s)
+ s.containments[:to][:+] + s.containments[:to][:-]
+ end
+
+ # Searches all containments of +contained+ in +container+.
+ # Returns a possibly empty array of containments.
+ #
+ # @return [Array<RGFA::Line::Containment>]
+ # @!macro [new] container_contained
+ # @param container [RGFA::Line::Segment, Symbol] a segment instance or name
+ # @param contained [RGFA::Line::Segment, Symbol] a segment instance or name
+ #
+ def containments_between(container, contained)
+ contained_in(container).select {|l| l.to.to_sym == contained.to_sym }
+ end
+
+ # Searches a containment of +contained+ in +container+.
+ # Returns the first containment found or nil if none found.
+ #
+ # @return [RGFA::Line::Containment, nil]
+ # @!macro container_contained
+ def containment(container, contained)
+ contained_in(container).each do |l|
+ if l.to.to_sym == contained.to_sym
+ return l
+ end
+ end
+ return nil
+ end
+
+ # Searches a containment of +contained+ in +container+.
+ # Raises an exception if no such containment was found.
+ #
+ # @return [RGFA::Line::Containment]
+ # @raise [RGFA::LineMissingError] if no such containment found
+ # @!macro container_contained
+ def containment!(container, contained)
+ c = containment(container, contained)
+ raise RGFA::LineMissingError, "No containment was found" if c.nil?
+ c
+ end
+
+end
diff --git a/lib/rgfa/error.rb b/lib/rgfa/error.rb
new file mode 100644
index 0000000..3f12671
--- /dev/null
+++ b/lib/rgfa/error.rb
@@ -0,0 +1,3 @@
+# Parent class for library-specific errors
+class RGFA::Error < StandardError; end
+
diff --git a/lib/rgfa/field_array.rb b/lib/rgfa/field_array.rb
new file mode 100644
index 0000000..5b50d42
--- /dev/null
+++ b/lib/rgfa/field_array.rb
@@ -0,0 +1,87 @@
+# Array representing multiple values of the same tag in different header lines
+class RGFA::FieldArray < Array
+  attr_reader :datatype
+
+  # @param datatype [RGFA::Line::OPTFIELD_DATATYPE] the datatype to use
+  # @param data [Array] the initial content
+  def initialize(datatype, data = [])
+    @datatype = datatype
+    super(data)
+  end
+
+  # Run a datatype-specific validation on each element of the array
+  # @param datatype [RGFA::Line::OPTFIELD_DATATYPE] (ignored, the datatype
+  #   of the field array is always used)
+  # @param fieldname [Symbol] the field name to use for error messages
+  def validate_gfa_field!(datatype, fieldname=nil)
+    # the validation is applied to the elements; each without a block
+    # returns an Enumerator, which cannot be validated itself
+    each {|elem| elem.validate_gfa_field!(@datatype, fieldname)}
+  end
+
+  # Default datatype, in this case :J
+  # @api private
+  def default_gfa_datatype
+    :J
+  end
+
+  # Representation of the field array as JSON array, with
+  # two additional values: the datatype and a zero byte as "signature".
+  # @param datatype [RGFA::Line::OPTFIELD_DATATYPE] (ignored, J is always used)
+  # @return [String]
+  # @api private
+  def to_gfa_field(datatype: nil)
+    # the signature is added to a copy: appending it to self would add two
+    # further elements to the field array each time it is written
+    (to_a + [@datatype, "\0"]).to_json
+  end
+
+  # Add a value to the array and validate
+  # @raise [RGFA::FieldArray::TypeMismatchError] if the type
+  #   of the new value does not correspond to the type of
+  #   existing values
+  # @param value [Object] the value to add
+  # @param type [RGFA::Line::OPTFIELD_DATATYPE, nil] the datatype to use;
+  #   if not +nil+, it will be checked that the specified datatype is the
+  #   same as for previous elements of the field array;
+  #   if +nil+, the value will be validated, according to the datatype
+  #   specified on field array creation
+  # @param fieldname [Symbol] the field name to use for error messages
+  #
+  def push_with_validation(value, type, fieldname=nil)
+    if type.nil?
+      value.validate_gfa_field!(@datatype, fieldname)
+    elsif type != @datatype
+      raise RGFA::FieldArray::TypeMismatchError,
+        "Datatype mismatch error for field #{fieldname}:\n"+
+        "value: #{value}\n"+
+        "existing datatype: #{@datatype};\n"+
+        "new datatype: #{type}"
+    end
+    self << value
+  end
+end
+
+# Generic error associated with field arrays
+class RGFA::FieldArray::Error < RGFA::Error; end
+
+# Error raised when trying to add elements with a wrong datatype
+class RGFA::FieldArray::TypeMismatchError < RGFA::Error; end
+
+class Array
+  # Is this possibly a {RGFA::FieldArray} instance?
+  #
+  # (i.e. are the two last elements a datatype symbol
+  # and a zero byte?)
+  # @return [Boolean]
+  def rgfa_field_array?
+    # the second last element may be missing (array of size 1) or be of a
+    # type which cannot be converted to a symbol (e.g. a number)
+    return false unless self[-1] == "\0" and self[-2].respond_to?(:to_sym)
+    RGFA::Line::OPTFIELD_DATATYPE.include?(self[-2].to_sym)
+  end
+
+  # Create a {RGFA::FieldArray} from an array
+  # @param datatype [RGFA::Line::OPTFIELD_DATATYPE, nil] the datatype to use;
+  #   not considered if the array contains the field array signature
+  # @raise [RGFA::FieldArray::Error] if no datatype is specified and the
+  #   array does not contain the field array signature
+  # @return [RGFA::FieldArray]
+  def to_rgfa_field_array(datatype=nil)
+    if self.rgfa_field_array?
+      # the datatype and the zero byte are not part of the content
+      RGFA::FieldArray.new(self[-2].to_sym, self[0..-3])
+    elsif datatype.nil?
+      raise RGFA::FieldArray::Error, "no datatype specified"
+    else
+      RGFA::FieldArray.new(datatype, self)
+    end
+  end
+end
diff --git a/lib/rgfa/field_parser.rb b/lib/rgfa/field_parser.rb
new file mode 100644
index 0000000..14ef2dc
--- /dev/null
+++ b/lib/rgfa/field_parser.rb
@@ -0,0 +1,109 @@
+require "json"
+require_relative "byte_array"
+require_relative "numeric_array"
+require_relative "cigar"
+require_relative "error"
+require_relative "field_array"
+
+#
+# Methods to parse the string representations of the GFA fields
+# @api private
+#
+module RGFA::FieldParser
+
+ # Parse a string representation of a GFA field value
+ # @raise [RGFA::Error] if the value is not valid
+ # @param datatype [RGFA::Line::FIELD_DATATYPE]
+ def parse_gfa_field(datatype: nil,
+ validate_strings: true,
+ fieldname: nil,
+ frozen: false)
+ case datatype
+ when :A, :Z, :seq
+ validate_gfa_field!(datatype, fieldname: fieldname) if validate_strings
+ self.freeze if frozen
+ return self
+ when :lbl, :orn
+ validate_gfa_field!(datatype, fieldname: fieldname) if validate_strings
+ return to_sym.freeze
+ when :i
+ return Integer(self)
+ when :pos
+ value = Integer(self)
+ raise RGFA::FieldParser::FormatError if value < 0
+ return value
+ when :f
+ return Float(self)
+ when :H
+ value = to_byte_array
+ value.freeze if frozen
+ return value
+ when :B
+ value = to_numeric_array
+ value.freeze if frozen
+ return value
+ when :J
+ value = JSON.parse(self)
+ # RGFA convention for array of fields
+ if value.kind_of?(Array) and value.rgfa_field_array?
+ value = value.to_rgfa_field_array
+ end
+ # no need to freeze, as any Hash or Array will be valid
+ return value
+ when :cig
+ value = to_cigar
+ value.freeze if frozen
+ return value
+ when :cgs
+ value = split(",").map do |c|
+ c = c.to_cigar
+ c.freeze if frozen
+ c
+ end
+ value.freeze if frozen
+ return value
+ when :lbs
+ value = split(",").map do |l|
+ o = l[-1].to_sym
+ l = l[0..-2]
+ if validate_strings
+ l.validate_gfa_field!(:lbl, fieldname: "#{fieldname} "+
+ "(entire field content: #{self})" )
+ end
+ os = [l.to_sym, o].to_oriented_segment
+ os.freeze if frozen
+ os
+ end
+ value.freeze if frozen
+ return value
+ else
+ raise RGFA::FieldParser::UnknownDatatypeError,
+ "Datatype unknown: #{datatype.inspect}"
+ end
+ end
+
+ # Parses an optional field in the form tagname:datatype:value
+ # and parses the value according to the datatype
+ # @raise [RGFA::FieldParser::FormatError] if the string does not represent
+ # an optional field
+ # @return [Array(Symbol, RGFA::Line::FIELD_DATATYPE, String)]
+ # the parsed content of the field
+ def parse_gfa_optfield
+   # \A and \z anchor the whole string: with ^ and $ a string with several
+   # lines would be accepted, if any of the lines looks like a field
+   if self =~ /\A([A-Za-z][A-Za-z0-9]):([AifZJHB]):(.+)\z/
+     # tag name, datatype and the (still unparsed) value
+     return $1.to_sym, $2.to_sym, $3
+   else
+     raise RGFA::FieldParser::FormatError,
+       "Expected optional field, found: #{self.inspect}"
+   end
+ end
+end
+
+# Error raised if the field content has an invalid format
+class RGFA::FieldParser::FormatError < RGFA::Error; end
+
+# Error raised if an unknown datatype symbol is used
+class RGFA::FieldParser::UnknownDatatypeError < RGFA::Error; end
+
+class String
+ include RGFA::FieldParser
+end
diff --git a/lib/rgfa/field_validator.rb b/lib/rgfa/field_validator.rb
new file mode 100644
index 0000000..3890e68
--- /dev/null
+++ b/lib/rgfa/field_validator.rb
@@ -0,0 +1,241 @@
+require_relative "field_parser"
+require_relative "line"
+
+#
+# Methods to validate the string representations of the GFA fields data
+# @api private
+#
+module RGFA::FieldValidator
+
+ # Validation regular expressions, derived from the GFA specification
+ DATASTRING_VALIDATION_REGEXP = {
+ :A => /^[!-~]$/, # Printable character
+ :i => /^[-+]?[0-9]+$/, # Signed integer
+ :f => /^[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?$/,
+ # Single-precision floating number
+ :Z => /^[ !-~]+$/, # Printable string, including space
+ :J => /^[ !-~]+$/, # JSON, excluding new-line and tab characters
+ :H => /^[0-9A-F]+$/, # Byte array in the Hex format
+ :B => /^[cCsSiIf](,[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?)+$/,
+ # Integer or numeric array
+ :lbl => /^[!-)+-<>-~][!-~]*$/, # segment/path label
+ :orn => /^\+|-$/, # segment orientation
+ :lbs => /^[!-)+-<>-~][!-~]*[+-](,[!-)+-<>-~][!-~]*[+-])+$/,
+ # multiple labels with orientations, comma-sep
+ :seq => /^\*$|^[A-Za-z=.]+$/, # nucleotide sequence
+ :pos => /^[0-9]*$/, # positive integer
+ :cig => /^(\*|(([0-9]+[MIDNSHPX=])+))$/, # CIGAR string
+ :cgs => /^(\*|(([0-9]+[MIDNSHPX=])+))(,(\*|(([0-9]+[MIDNSHPX=])+)))*$/,
+ # multiple CIGARs, comma-sep
+ }
+
+ # Validates the string according to the provided datatype
+ # @param datatype [RGFA::Line::FIELD_DATATYPE]
+ # @param fieldname [#to_s] Fieldname to use in the error msg
+ # @raise [RGFA::FieldParser::FormatError] if the string does not match
+ # the regexp for the provided datatype
+ # @return [void]
+ # @api private
+ def validate_gfa_field!(datatype, fieldname=nil)
+ regexp = DATASTRING_VALIDATION_REGEXP[datatype]
+ raise RGFA::FieldParser::UnknownDatatypeError if regexp.nil?
+ if (regexp !~ self)
+ fieldname ||= "Value"
+ raise RGFA::FieldParser::FormatError,
+ "Wrong format for field #{fieldname}: \n"+
+ "Content: #{self.inspect}\n"+
+ "Datatype: #{datatype}\n"+
+ "Expected regex: #{regexp}\n"
+ end
+ end
+
+end
+
+class String
+ include RGFA::FieldValidator
+end
+
+class Object
+ # @!macro [new] validate_gfa_field
+ # Validates the object according to the provided datatype
+ # @param datatype [RGFA::Line::FIELD_DATATYPE]
+ # @param fieldname [#to_s] Fieldname to use in the error msg
+ # @raise [RGFA::FieldParser::FormatError] if the object type or content
+ # is not compatible to the provided datatype
+ # @return [void]
+ # @api private
+ def validate_gfa_field!(datatype, fieldname=nil)
+ raise RGFA::FieldParser::FormatError,
+ "Wrong type (#{self.class}) for field #{fieldname}\n"+
+ "Content: #{self.inspect}\n"+
+ "Datatype: #{datatype}"
+ end
+end
+
+class Symbol
+ # @!macro validate_gfa_field
+ def validate_gfa_field!(datatype, fieldname=nil)
+ if datatype != :lbl and datatype != :orn and datatype != :Z
+ raise RGFA::FieldParser::FormatError,
+ "Wrong type (#{self.class}) for field #{fieldname}\n"+
+ "Content: #{self.inspect}\n"+
+ "Datatype: #{datatype}"
+ end
+ self.to_s.validate_gfa_field!(datatype)
+ end
+end
+
+class Hash
+ # @!macro validate_gfa_field
+ def validate_gfa_field!(datatype, fieldname=nil)
+ if datatype != :J
+ raise RGFA::FieldParser::FormatError,
+ "Wrong type (#{self.class}) for field #{fieldname}\n"+
+ "Content: #{self.inspect}\n"+
+ "Datatype: #{datatype}"
+ end
+ end
+end
+
+class Array
+ # @!macro validate_gfa_field
+ #
+ # Arrays are always accepted for the J and Z datatypes; for lbs, cig,
+ # cgs, B and H the array is converted to the class representing the
+ # datatype and validated; any other datatype is refused.
+ def validate_gfa_field!(datatype, fieldname=nil)
+ return if :J == datatype or :Z == datatype
+ supported = true
+ begin
+ case datatype
+ when :lbs
+ # NOTE(review): map! converts the elements of the receiver in place
+ # (the cgs case uses map instead); presumably intended -- confirm
+ map!(&:to_oriented_segment).each(&:validate!)
+ when :cig then to_cigar.validate!
+ when :cgs then map(&:to_cigar).each(&:validate!)
+ when :B then to_numeric_array.validate!
+ when :H then to_byte_array.validate!
+ else supported = false
+ end
+ rescue => err
+ # errors raised by the conversion or by validate! are reported as
+ # invalid content of the field
+ raise RGFA::FieldParser::FormatError,
+ "Invalid content for field #{fieldname}\n"+
+ "Content: #{self.inspect}\n"+
+ "Datatype: #{datatype}\n"+
+ "Error: #{err}"
+ end
+ return if supported
+ # no array representation exists for the requested datatype
+ raise RGFA::FieldParser::FormatError,
+ "Wrong type (#{self.class}) for field #{fieldname}\n"+
+ "Content: #{self.inspect}\n"+
+ "Datatype: #{datatype}"
+ end
+end
+
+class RGFA::ByteArray
+ # @!macro validate_gfa_field
+ def validate_gfa_field!(datatype, fieldname=nil)
+ if datatype != :H
+ raise RGFA::FieldParser::FormatError,
+ "Wrong type (#{self.class}) for field #{fieldname}\n"+
+ "Content: #{self.inspect}\n"+
+ "Datatype: #{datatype}"
+ end
+ begin
+ validate!
+ rescue => err
+ raise RGFA::FieldParser::FormatError,
+ "Invalid content for field #{fieldname}\n"+
+ "Content: #{self.inspect}\n"+
+ "Datatype: #{datatype}\n"+
+ "Error: #{err}"
+ end
+ end
+end
+
+class RGFA::CIGAR
+ # @!macro validate_gfa_field
+ def validate_gfa_field!(datatype, fieldname=nil)
+ if datatype != :cig
+ raise RGFA::FieldParser::FormatError,
+ "Wrong type (#{self.class}) for field #{fieldname}\n"+
+ "Content: #{self.inspect}\n"+
+ "Datatype: #{datatype}"
+ end
+ begin
+ validate!
+ rescue => err
+ raise RGFA::FieldParser::FormatError,
+ "Invalid content for field #{fieldname}\n"+
+ "Content: #{self.inspect}\n"+
+ "Datatype: #{datatype}\n"+
+ "Error: #{err}"
+ end
+ end
+end
+
+class RGFA::NumericArray
+ # @!macro validate_gfa_field
+ def validate_gfa_field!(datatype, fieldname=nil)
+ if datatype != :B
+ raise RGFA::FieldParser::FormatError,
+ "Wrong type (#{self.class}) for field #{fieldname}\n"+
+ "Content: #{self.inspect}\n"+
+ "Datatype: #{datatype}"
+ end
+ begin
+ validate!
+ rescue => err
+ raise RGFA::FieldParser::FormatError,
+ "Invalid content for field #{fieldname}\n"+
+ "Content: #{self.inspect}\n"+
+ "Datatype: #{datatype}\n"+
+ "Error: #{err}"
+ end
+ end
+end
+
+class Float
+ # @!macro validate_gfa_field
+ def validate_gfa_field!(datatype, fieldname=nil)
+ if datatype != :f and datatype != :Z
+ raise RGFA::FieldParser::FormatError,
+ "Wrong type (#{self.class}) for field #{fieldname}\n"+
+ "Content: #{self.inspect}\n"+
+ "Datatype: #{datatype}"
+ end
+ end
+end
+
+class Fixnum
+ # @!macro validate_gfa_field
+ def validate_gfa_field!(datatype, fieldname=nil)
+ if (datatype == :pos and self < 0)
+ raise RGFA::FieldParser::FormatError,
+ "Invalid content for field #{fieldname}\n"+
+ "Content: #{self.inspect}\n"+
+ "Datatype: #{datatype}"
+ elsif ![:i, :f, :Z].include?(datatype)
+ raise RGFA::FieldParser::FormatError,
+ "Wrong type (#{self.class}) for field #{fieldname}\n"+
+ "Content: #{self.inspect}\n"+
+ "Datatype: #{datatype}"
+ end
+ end
+end
+
+class RGFA::Line::Segment
+ # @!macro validate_gfa_field
+ def validate_gfa_field!(datatype, fieldname=nil)
+ if datatype != :lbl
+ raise RGFA::FieldParser::FormatError,
+ "Wrong type (#{self.class}) for field #{fieldname}\n"+
+ "Content: <RGFA::Segment:#{self.to_s}>\n"+
+ "Datatype: #{datatype}"
+ end
+ end
+end
diff --git a/lib/rgfa/field_writer.rb b/lib/rgfa/field_writer.rb
new file mode 100644
index 0000000..d54c63b
--- /dev/null
+++ b/lib/rgfa/field_writer.rb
@@ -0,0 +1,108 @@
+require "json"
+require_relative "byte_array"
+require_relative "numeric_array"
+require_relative "line"
+
+#
+# Methods to convert ruby objects to the GFA string representations
+# @api private
+#
+# The default conversion is implemented in this module, which is included in
+# Object; single classes may overwrite the following methods, if necessary:
+# - {#default_gfa_datatype}, which returns the symbol of the optional
+# field GFA datatype to use, if none is specified
+# (See RGFA::Line::FIELD_DATATYPE); the default is :Z
+# - {#to_gfa_field} should return a GFA string representation,
+# possibly depending on the specified datatype; no validation is done;
+# the default is #to_s
+#
+module RGFA::FieldWriter
+
+ # @!macro [new] to_gfa_field
+ # Representation of the data for GFA fields; this method
+ # does not (in general) validate the string. The method
+ # can be overridden for a given class, and may take
+ # the requested datatype into consideration.
+ # @return [String]
+ # @api private
+ def to_gfa_field(datatype: nil); to_s; end # datatype is ignored here
+
+ # Representation of the data as an optional field (tagname:datatype:value)
+ # @param fieldname [Symbol] the tag name
+ # @param datatype [RGFA::Line::OPTFIELD_DATATYPE] (<i>defaults to: the value
+ # returned by {#default_gfa_datatype}</i>)
+ # @api private
+ def to_gfa_optfield(fieldname, datatype: default_gfa_datatype)
+ return "#{fieldname}:#{datatype}:#{to_gfa_field(datatype: datatype)}"
+ end
+
+ # @!macro [new] gfa_datatype
+ # Optional field GFA datatype to use, if none is provided
+ # @return [RGFA::Line::FIELD_DATATYPE]
+ # @api private
+ def default_gfa_datatype; :Z; end # the generic default is a string field
+end
+
+class Object
+ include RGFA::FieldWriter
+end
+
+class Fixnum
+ # @!macro gfa_datatype
+ def default_gfa_datatype; :i; end
+end
+
+class Float
+ # @!macro gfa_datatype
+ def default_gfa_datatype; :f; end
+end
+
+class Hash
+ # @!macro to_gfa_field
+ def to_gfa_field(datatype: nil); to_json; end
+
+ # @!macro gfa_datatype
+ def default_gfa_datatype; :J; end
+end
+
+class Array
+ # @!macro to_gfa_field
+ def to_gfa_field(datatype: default_gfa_datatype)
+ case datatype
+ when :B
+ to_numeric_array.to_s
+ when :J
+ to_json
+ when :cig
+ to_cigar.to_s
+ when :cgs
+ map{|cig|cig.to_cigar.to_s}.join(",")
+ when :lbs
+ map{|os|os.to_oriented_segment.to_s}.join(",")
+ when :H
+ to_byte_array.to_s
+ else
+ map(&:to_s).join(",")
+ end
+ end
+
+ # @!macro gfa_datatype
+ def default_gfa_datatype
+ (all?{|i|i.kind_of?(Integer)} or all?{|i|i.kind_of?(Float)}) ? :B : :J
+ end
+end
+
+class RGFA::ByteArray
+ # @!macro gfa_datatype
+ def default_gfa_datatype; :H; end
+end
+
+class RGFA::NumericArray
+ # @!macro gfa_datatype
+ def default_gfa_datatype; :B; end
+end
+
+class RGFA::Line::Segment
+ # @!macro to_gfa_field
+ def to_gfa_field(datatype: nil); to_sym.to_s; end
+end
diff --git a/lib/rgfa/headers.rb b/lib/rgfa/headers.rb
new file mode 100644
index 0000000..15c0661
--- /dev/null
+++ b/lib/rgfa/headers.rb
@@ -0,0 +1,76 @@
+require_relative "error"
+require_relative "field_array"
+
+# Methods for accessing the GFA header information.
+#
+# The GFA header is accessed using {#header RGFA#header},
+# which returns a {RGFA::Line::Header} object.
+#
+# @example Accessing the header information
+# rgfa.header.VN # => "1.0"
+# rgfa.header.co = "This is the header comment"
+# rgfa.header.ni = 100
+# rgfa.header.field_to_s(:ni) # => "ni:i:100"
+#
+# == Multiple header lines defining the same tag
+#
+# The specification does not explicitly forbid having the same tag on
+# different lines. To represent this case, a "field array"
+# ({RGFA::FieldArray RGFA::FieldArray}) is used, which is an array of
+# instances of a tag, from different lines of the header.
+#
+# @example Header with tags repeated on different lines (see {RGFA::FieldArray})
+# rgfa.header.ni # => RGFA::FieldArray<[100,200] @datatype: :i>
+# rgfa.header.ni[0] # 100
+# rgfa.header.ni << 200 # "200" is also OK
+# rgfa.header.ni.map!{|i|i-10}
+# rgfa.header.ni = [100,200,300].to_rgfa_field_array
+#
+# @example Adding instances of a tag (will go on different header lines)
+# rgfa.header.add(:xx, 100) # => 100 # single i tag, if .xx did not exist yet
+# rgfa.header.add(:xx, 100) # => RGFA::FieldArray<[100,100] @datatype: :i>
+# rgfa.header.add(:xx, 100) # => RGFA::FieldArray<[100,100,100] @datatype :i>
+#
+module RGFA::Headers
+
+ # @return [RGFA::Line::Header] a header line representing the entire header
+ # information; if multiple header lines were present, and they contain the
+ # same tag, the tag value is represented by a {RGFA::FieldArray}
+ def header
+ @headers
+ end
+
+ # Header information in single-tag-lines.
+ #
+ # Returns an array of RGFA::Line::Header
+ # objects, each containing a single field of the header.
+ # @!macro readonly
+ # @note Read-only! The returned array contains copies of the original
+ # values, i.e.\ changes in the lines will not affect the RGFA object; to
+ # update the values in the RGFA use the #header method.
+ # @return [Array<RGFA::Line::Header>]
+ # @api private
+ def headers
+ @headers.split
+ end
+
+ # Remove all information from the header.
+ # @return [RGFA] self
+ # @api private
+ def delete_headers
+ init_headers
+ return self
+ end
+
+ protected
+
+ # Add a GFA line to the header. This is useful for constructing the graph.
+ # For adding values to the header, see #header.
+ # @param gfa_line [String, RGFA::Line::Header] a string representing a valid
+ # header line, or a RGFA header line object
+ def add_header(gfa_line)
+ gfa_line = gfa_line.to_rgfa_line(validate: @validate)
+ @headers.merge(gfa_line)
+ end
+
+end
diff --git a/lib/rgfa/line.rb b/lib/rgfa/line.rb
new file mode 100644
index 0000000..287d315
--- /dev/null
+++ b/lib/rgfa/line.rb
@@ -0,0 +1,721 @@
+require "set"
+#
+# Generic representation of a record of a RGFA file.
+#
+# @!macro [new] rgfa_line
+# @note
+# This class is usually not meant to be directly initialized by the user;
+# initialize instead one of its child classes, which define the concrete
+# different record types.
+#
+class RGFA::Line
+
+ # Separator in the string representation of RGFA lines
+ SEPARATOR = "\t"
+
+ # List of allowed record_type values
+ RECORD_TYPES = [ :H, :S, :L, :C, :P ]
+
+ # Full name of the record types
+ RECORD_TYPE_LABELS = {
+ :H => "header",
+ :S => "segment",
+ :L => "link",
+ :C => "containment",
+ :P => "path",
+ }
+
+ # A symbol representing a datatype for optional fields
+ OPTFIELD_DATATYPE = [:A, :i, :f, :Z, :J, :H, :B]
+
+ # A symbol representing a datatype for required fields
+ REQFIELD_DATATYPE = [:lbl, :orn, :lbs, :seq, :pos, :cig, :cgs]
+
+ # A symbol representing a valid datatype
+ FIELD_DATATYPE = OPTFIELD_DATATYPE + REQFIELD_DATATYPE
+
+ # List of data types which are parsed only on access;
+ # all other are parsed when read.
+ DELAYED_PARSING_DATATYPES = [:cig, :cgs, :lbs, :H, :J, :B]
+
+ # Direction of a segment for links/containments
+ DIRECTION = [:from, :to]
+
+ # Orientation of segments in paths/links/containments
+ ORIENTATION = [:+, :-]
+
+ # @!macro rgfa_line
+ #
+ # @param data [Array<String>] the content of the line; if
+ # an array of strings, this is interpreted as the splitted content
+ # of a GFA file line; note: an hash
+ # is also allowed, but this is for internal usage and shall be considered
+ # private
+ # @param validate [Integer] see paragraph Validation
+ # @param virtual [Boolean] <i>(default: +false+)</i>
+ # mark the line as virtual, i.e. not yet found in the GFA file;
+ # e.g. a link is allowed to refer to a segment which is not
+ # yet created; in this case a segment marked as virtual is created,
+ # which is replaced by a non-virtual segment, when the segment
+ # line is later found
+ #
+ # <b> Constants defined by subclasses </b>
+ #
+ # Subclasses of RGFA::Line _must_ define the following constants:
+ # - RECORD_TYPE [RGFA::Line::RECORD_TYPES]
+ # - REQFIELDS [Array<Symbol>] required fields
+ # - PREDEFINED_OPTFIELDS [Array<Symbol>] predefined optional fields
+ # - DATATYPE [Hash{Symbol=>Symbol}]:
+ # datatypes for the required fields and the predefined optional fields
+ #
+ # @raise [RGFA::Line::RequiredFieldMissingError]
+ # if too few required fields are specified
+ # @raise [RGFA::Line::CustomOptfieldNameError]
+ # if a non-predefined optional field uses upcase letters
+ # @raise [RGFA::Line::DuplicatedOptfieldNameError]
+ # if an optional field tag name is used more than once
+ # @raise [RGFA::Line::PredefinedOptfieldTypeError]
+ # if the type of a predefined optional field does not
+ # respect the specified type.
+ #
+ # @return [RGFA::Line]
+ #
+ # <b>Validation levels</b>
+ #
+ # The default is 2, i.e. if a field content is changed, the user is
+ # responsible to call #validate_field!, if necessary.
+ #
+ # - 0: no validation
+ # - 1: the number of required fields must be correct; optional fields
+ # cannot be duplicated; custom optional field names must be correct;
+ # predefined optional fields must have the correct type; only some
+ # fields are validated on initialization or first-time access to
+ # the field content
+ # - 2: 1 + all fields are validated on initialization or first-time
+ # access to the field content
+ # - 3: 2 + all fields are validated on initialization and record-specific
+ # validations are run (e.g. compare segment LN tag and sequence length)
+ # - 4: 3 + all fields are validated on writing to string
+ # - 5: 4 + all fields are validated by get and set methods
+ #
+ def initialize(data, validate: 2, virtual: false)
+ unless self.class.const_defined?(:"RECORD_TYPE")
+ raise RuntimeError, "This class shall not be directly instantiated"
+ end
+ @validate = validate
+ @virtual = virtual
+ @datatype = {}
+ @data = {}
+ if data.kind_of?(Hash)
+ @data.merge!(data)
+ else
+ # normal initialization, data is an array of strings
+ initialize_required_fields(data)
+ initialize_optional_fields(data)
+ validate_record_type_specific_info! if @validate >= 3
+ end
+ end
+
+ # Select a subclass based on the record type
+ # @raise [RGFA::Line::UnknownRecordTypeError] if the record_type is not valid
+ # @return [Class] a subclass of RGFA::Line
+ def self.subclass(record_type)
+ case record_type.to_sym
+ when :H then RGFA::Line::Header
+ when :S then RGFA::Line::Segment
+ when :L then RGFA::Line::Link
+ when :C then RGFA::Line::Containment
+ when :P then RGFA::Line::Path
+ else
+ raise RGFA::Line::UnknownRecordTypeError,
+ "Record type unknown: '#{record_type}'"
+ end
+ end
+
+ # @return [Symbol] record type code
+ def record_type
+ self.class::RECORD_TYPE
+ end
+
+ # @return [Array<Symbol>] fields defined for this instance
+ def fieldnames
+ @data.keys
+ end
+
+ # @return [Array<Symbol>] name of the required fields
+ def required_fieldnames
+ self.class::REQFIELDS
+ end
+
+ # @return [Array<Symbol>] name of the optional fields
+ def optional_fieldnames
+ (@data.keys - self.class::REQFIELDS)
+ end
+
+ # Deep copy of a RGFA::Line instance.
+ # @return [RGFA::Line]
+ def clone
+ data_cpy = {}
+ @data.each_pair do |k, v|
+ if field_datatype(k) == :J
+ data_cpy[k] = JSON.parse(v.to_json)
+ elsif v.kind_of?(Array) or v.kind_of?(String)
+ data_cpy[k] = v.clone
+ else
+ data_cpy[k] = v
+ end
+ end
+ cpy = self.class.new(data_cpy, validate: @validate, virtual: @virtual)
+ cpy.instance_variable_set("@datatype", @datatype.clone)
+ return cpy
+ end
+
+ # Is the line virtual?
+ #
+ # Is this RGFA::Line a virtual line representation
+ # (i.e. a placeholder for an expected but not encountered yet line)?
+ # @api private
+ # @return [Boolean]
+ def virtual?
+ @virtual
+ end
+
+ # Make a virtual line real.
+ # @api private
+ # This is called when a line which is expected, and for which a virtual
+ # line has been created, is finally found. So the line is converted into
+ # a real line, by merging in the line information from the found line.
+ # @param real_line [RGFA::Line] the real line found
+ def real!(real_line)
+ @virtual = false
+ real_line.data.each_pair do |k, v|
+ @data[k] = v
+ end
+ end
+
+ # @return [String] a string representation of self
+ def to_s
+ to_a.join(SEPARATOR)
+ end
+
+ # @return [Array<String>] an array of string representations of the fields
+ def to_a
+ a = [record_type]
+ required_fieldnames.each {|fn| a << field_to_s(fn, optfield: false)}
+ optional_fieldnames.each {|fn| a << field_to_s(fn, optfield: true)}
+ return a
+ end
+
+ # Returns the optional fields as an array of [fieldname, datatype, value]
+ # arrays.
+ # @return [Array<[Symbol, Symbol, Object]>]
+ def tags
+ retval = []
+ optional_fieldnames.each do |of|
+ retval << [of, get_datatype(of), get(of)]
+ end
+ return retval
+ end
+
+ # Remove an optional field from the line, if it exists;
+ # do nothing if it does not
+ # @param fieldname [Symbol] the tag name of the optfield to remove
+ # @return [Object, nil] the deleted value or nil, if the field was not defined
+ def delete(fieldname)
+ if optional_fieldnames.include?(fieldname)
+ @datatype.delete(fieldname)
+ return @data.delete(fieldname)
+ else
+ return nil
+ end
+ end
+
+ # Raises an error if the content of the field does not correspond to
+ # the field type
+ #
+ # @param fieldname [Symbol] the tag name of the field to validate
+ # @raise [RGFA::FieldParser::FormatError] if the content of the field is
+ # not valid, according to its required type
+ # @return [void]
+ def validate_field!(fieldname)
+ v = @data[fieldname]
+ t = field_or_default_datatype(fieldname, v)
+ v.validate_gfa_field!(t, fieldname)
+ return nil
+ end
+
+ # @!macro [new] field_to_s
+ # Compute the string representation of a field.
+ #
+ # @param fieldname [Symbol] the tag name of the field
+ # @param optfield [Boolean] <i>(defaults to: +false+)</i>
+ # return the tagname:datatype:value representation
+ #
+ # @raise [RGFA::Line::TagMissingError] if field is not defined
+ # @return [String] the string representation
+ def field_to_s(fieldname, optfield: false)
+ field = @data[fieldname]
+ raise RGFA::Line::TagMissingError,
+ "No value defined for tag #{fieldname}" if field.nil?
+ t = field_or_default_datatype(fieldname, field)
+ if !field.kind_of?(String)
+ field = field.to_gfa_field(datatype: t)
+ end
+ field.validate_gfa_field!(t, fieldname) if @validate >= 4
+ return optfield ? field.to_gfa_optfield(fieldname, datatype: t) : field
+ end
+
+ # Returns a symbol, which specifies the datatype of a field
+ #
+ # @param fieldname [Symbol] the tag name of the field
+ # @return [RGFA::Line::FIELD_DATATYPE] the datatype symbol
+ def get_datatype(fieldname)
+ field_or_default_datatype(fieldname, @data[fieldname])
+ end
+
+ # Set the datatype of a field.
+ #
+ # If an existing field datatype is changed, its content may become
+ # invalid (call #validate_field! if necessary).
+ #
+ # If the method is used for a required field or a predefined field,
+ # the line will use the specified datatype instead of the predefined
+ # one, resulting in a potentially invalid line.
+ #
+ # @param fieldname [Symbol] the field name (it is not required that
+ # the field exists already)
+ # @param datatype [RGFA::Line::FIELD_DATATYPE] the datatype
+ # @raise [RGFA::Line::UnknownDatatype] if +datatype+ is not
+ # a valid datatype for optional fields
+ # @return [RGFA::Line::FIELD_DATATYPE] the datatype
+ def set_datatype(fieldname, datatype)
+ unless OPTFIELD_DATATYPE.include?(datatype)
+ raise RGFA::Line::UnknownDatatype, "Unknown datatype: #{datatype}"
+ end
+ @datatype[fieldname] = datatype
+ end
+
+ # Set the value of a field.
+ #
+ # If a datatype for a new custom optional field is not set,
+ # the default for the value assigned to the field will be used
+ # (e.g. J for Hashes, i for Integer, etc).
+ #
+ # @param fieldname [Symbol] the name of the field to set
+ # (required field, predefined optional field (uppercase) or custom optional
+ # field name (lowercase))
+ # @raise [RGFA::Line::FieldnameError] if +fieldname+ is not a
+ # valid predefined or custom optional name (and +validate[:tags]+)
+ # @return [Object] +value+
+ def set(fieldname, value)
+ if @data.has_key?(fieldname) or predefined_optional_fieldname?(fieldname)
+ return set_existing_field(fieldname, value)
+ elsif (@validate == 0) or valid_custom_optional_fieldname?(fieldname)
+ define_field_methods(fieldname)
+ if !@datatype[fieldname].nil?
+ return set_existing_field(fieldname, value)
+ elsif !value.nil?
+ @datatype[fieldname] = value.default_gfa_datatype
+ return @data[fieldname] = value
+ end
+ else
+ raise RGFA::Line::FieldnameError,
+ "#{fieldname} is not an existing or predefined field or a "+
+ "valid custom optional field"
+ end
+ end
+
+ # Get the value of a field
+ # @param fieldname [Symbol] name of the field
+ # @param frozen [Boolean] <i>defaults to: +false+</i> return a frozen value;
+ # this guarantees that a validation will not be necessary on output
+ # if the field value has not been changed using #set
+ # @return [Object,nil] value of the field
+ # or +nil+ if field is not defined
+ def get(fieldname, frozen: false) # NOTE(review): frozen is not used in the body -- confirm
+ v = @data[fieldname]
+ if v.kind_of?(String)
+ t = field_datatype(fieldname)
+ if t != :Z and t != :seq
+ # unparsed value (or string set by the user): parse it and store the result
+ return (@data[fieldname] = v.parse_gfa_field(datatype: t,
+ validate_strings:
+ @validate >= 2))
+ else
+ v.validate_gfa_field!(t, fieldname) if (@validate >= 5) # Z and seq are returned as strings
+ end
+ elsif !v.nil?
+ if (@validate >= 5) # already parsed values are validated at level 5 only
+ t = field_datatype(fieldname)
+ v.validate_gfa_field!(t, fieldname)
+ end
+ end
+ return v
+ end
+
+ # Value of a field, raising an exception if it is not defined
+ # @param fieldname [Symbol] name of the field
+ # @raise [RGFA::Line::TagMissingError] if field is not defined
+ # @return [Object,nil] value of the field
+ def get!(fieldname)
+ v = get(fieldname)
+ raise RGFA::Line::TagMissingError,
+ "No value defined for tag #{fieldname}" if v.nil?
+ return v
+ end
+
+ # Methods are dynamically created for non-existing but valid optional
+ # field names. Methods for predefined optional fields and required fields
+ # are created dynamically for each subclass; methods for existing optional
+ # fields are created on instance initialization.
+ #
+ # ---
+ # - (Object) <fieldname>(parse=true)
+ # The parsed content of a field. See also #get.
+ #
+ # <b>Parameters:</b>
+ #
+ # <b>Returns:</b>
+ # - (String, Hash, Array, Integer, Float) the parsed content of the field
+ # - (nil) if the field does not exist, but is a valid optional field name
+ #
+ # ---
+ # - (Object) <fieldname>!(parse=true)
+ # The parsed content of a field, raising an exception if not available.
+ # See also #get!.
+ #
+ # <b>Returns:</b>
+ # - (String, Hash, Array, Integer, Float) the parsed content of the field
+ #
+ # <b>Raises:</b>
+ # - (RGFA::Line::TagMissingError) if the field does not exist
+ #
+ # ---
+ #
+ # - (self) <fieldname>=(value)
+ # Sets the value of a required or optional
+ # field, or creates a new optional field if the fieldname is
+ # non-existing but valid. See also #set, #set_datatype.
+ #
+ # <b>Parameters:</b>
+ # - +*value*+ (String|Hash|Array|Integer|Float) value to set
+ #
+ # ---
+ #
+ def method_missing(m, *args, &block)
+ field_name, operation, state = split_method_name(m)
+ if ((operation == :get or operation == :get!) and args.size > 1) or
+ (operation == :set and args.size != 1)
+ raise ArgumentError, "wrong number of arguments"
+ end
+ case state
+ when :invalid
+ super
+ when :existing
+ case operation
+ when :get
+ if args[0] == false
+ field_to_s(field_name)
+ else
+ get(field_name)
+ end
+ when :get!
+ if args[0] == false
+ field_to_s!(field_name)
+ else
+ get!(field_name)
+ end
+ when :set
+ set_existing_field(field_name, args[0])
+ return nil
+ end
+ when :valid
+ case operation
+ when :get
+ return nil
+ when :get!
+ raise RGFA::Line::TagMissingError,
+ "No value defined for tag #{field_name}"
+ when :set
+ set(field_name, args[0])
+ return nil
+ end
+ end
+ end
+
+ # Redefines respond_to? to correctly handle dynamical methods.
+ # @see #method_missing
+ def respond_to?(m, include_all=false)
+ super || (split_method_name(m)[2] != :invalid)
+ end
+
+ # @return self
+ # @param validate [Boolean] ignored (compatibility reasons)
+ def to_rgfa_line(validate: nil)
+ self
+ end
+
+ # Equivalence check
+ # @return [Boolean] does the line has the same record type,
+ # contains the same optional fields
+ # and all required and optional fields contain the same field values?
+ # @see RGFA::Line::Link#==
+ def ==(o)
+ return self.to_sym == o.to_sym if o.kind_of?(Symbol)
+ return false if (o.record_type != self.record_type)
+ return false if o.data.keys.sort != data.keys.sort
+ o.data.each do |k, v|
+ if @data[k] != o.data[k]
+ if field_to_s(k) != o.field_to_s(k)
+ return false
+ end
+ end
+ end
+ return true
+ end
+
+ # Validate the RGFA::Line instance
+ # @raise [RGFA::FieldParser::FormatError] if any field content is not valid
+ # @return [void]
+ def validate!
+ fieldnames.each {|fieldname| validate_field!(fieldname) }
+ validate_record_type_specific_info!
+ end
+
+ protected
+
+ def data
+ @data
+ end
+
+ def datatype
+ @datatype
+ end
+
+ private
+
+ def n_required_fields
+ self.class::REQFIELDS.size
+ end
+
+ def field_datatype(fieldname)
+ @datatype.fetch(fieldname, self.class::DATATYPE[fieldname])
+ end
+
+ def field_or_default_datatype(fieldname, value)
+ t = field_datatype(fieldname)
+ if t.nil?
+ t = value.default_gfa_datatype
+ @datatype[fieldname] = t
+ end
+ return t
+ end
+
+ def init_field_value(n ,t, s)
+ if @validate >= 3
+ s = s.parse_gfa_field(datatype: t, validate_strings: true)
+ elsif !DELAYED_PARSING_DATATYPES.include?(t)
+ s = s.parse_gfa_field(datatype: t, validate_strings: false)
+ end
+ @data[n] = s
+ end
+
+ def set_existing_field(fieldname, value)
+ if value.nil?
+ @data.delete(fieldname)
+ else
+ if @validate >= 5
+ field_or_default_datatype(fieldname, value)
+ value.validate_gfa_field!(field_datatype(fieldname), fieldname)
+ end
+ @data[fieldname] = value
+ end
+ end
+
+ def initialize_required_fields(strings)
+ if (@validate >= 1) and (strings.size < n_required_fields)
+ raise RGFA::Line::RequiredFieldMissingError,
+ "#{n_required_fields} required fields expected, "+
+ "#{strings.size}) found\n#{strings.inspect}"
+ end
+ n_required_fields.times do |i|
+ n = self.class::REQFIELDS[i]
+ init_field_value(n, self.class::DATATYPE[n], strings[i])
+ end
+ end
+
+ def valid_custom_optional_fieldname?(fieldname)
+ /^[a-z][a-z0-9]$/ =~ fieldname
+ end
+
+ def validate_custom_optional_fieldname!(fieldname)
+ if not valid_custom_optional_fieldname?(fieldname)
+ raise RGFA::Line::CustomOptfieldNameError,
+ "#{fieldname} is not a valid custom optional field name"
+ end
+ end
+
+ def predefined_optional_fieldname?(fieldname)
+ self.class::PREDEFINED_OPTFIELDS.include?(fieldname)
+ end
+
+ def initialize_optional_fields(strings) # handles the fields after the required ones
+ n_required_fields.upto(strings.size-1) do |i|
+ n, t, s = strings[i].parse_gfa_optfield # tag name, datatype, value string
+ if (@validate > 0)
+ if @data.has_key?(n)
+ raise RGFA::Line::DuplicatedOptfieldNameError,
+ "Optional field #{n} found multiple times"
+ elsif predefined_optional_fieldname?(n)
+ unless t == self.class::DATATYPE[n]
+ raise RGFA::Line::PredefinedOptfieldTypeError,
+ "Optional field #{n} must be of type "+
+ "#{self.class::DATATYPE[n]}, #{t} found"
+ end
+ elsif not valid_custom_optional_fieldname?(n)
+ raise RGFA::Line::CustomOptfieldNameError,
+ "Custom-defined optional "+
+ "fields must be lower case; found: #{n}"
+ else
+ @datatype[n] = t # custom field: remember the datatype found in the line
+ end
+ else
+ (@datatype[n] = t) if !field_datatype(t) # NOTE(review): looks up the datatype t, not the tag name n -- confirm
+ end
+ init_field_value(n, t, s)
+ end
+ end
+
+ def split_method_name(m) # => [field name, operation, state]
+ if @data.has_key?(m)
+ return m, :get, :existing # m is the name of a defined field
+ else
+ case m[-1] # the last character of the name selects the operation
+ when "!"
+ var = :get!
+ m = m[0..-2].to_sym
+ when "="
+ var = :set
+ m = m[0..-2].to_sym
+ else
+ var = :get
+ end
+ if @data.has_key?(m)
+ state = :existing
+ elsif self.class::PREDEFINED_OPTFIELDS.include?(m) or
+ valid_custom_optional_fieldname?(m)
+ state = :valid # not defined, but usable as name of an optional field
+ else
+ state = :invalid
+ end
+ return m, var, state
+ end
+ end
+
+ def validate_record_type_specific_info!
+ end
+
+ #
+ # Define field methods for a single field
+ #
+ def define_field_methods(fieldname)
+ define_singleton_method(fieldname) do
+ get(fieldname)
+ end
+ define_singleton_method :"#{fieldname}!" do
+ get!(fieldname)
+ end
+ define_singleton_method :"#{fieldname}=" do |value|
+ set_existing_field(fieldname, value)
+ end
+ end
+
+ #
+ # This avoids calls to method_missing for fields which are already defined
+ #
+ def self.define_field_methods!
+ (self::REQFIELDS+self::PREDEFINED_OPTFIELDS).each do |fieldname|
+ define_method(fieldname) do
+ get(fieldname)
+ end
+ define_method :"#{fieldname}!" do
+ get!(fieldname)
+ end
+ define_method :"#{fieldname}=" do |value|
+ set_existing_field(fieldname, value)
+ end
+ end
+ end
+ private_class_method :define_field_methods!
+
+end
+
+# Error raised if the record_type is not one of RGFA::Line::RECORD_TYPES
+class RGFA::Line::UnknownRecordTypeError < RGFA::Error; end
+
+# Error raised if an invalid datatype symbol is found
+class RGFA::Line::UnknownDatatype < RGFA::Error; end
+
+# Error raised if an invalid fieldname symbol is found
+class RGFA::Line::FieldnameError < RGFA::Error; end
+
+# Error raised if optional tag is not present
+class RGFA::Line::TagMissingError < RGFA::Error; end
+
+# Error raised if too few required fields are specified.
+class RGFA::Line::RequiredFieldMissingError < RGFA::Error; end
+
+# Error raised if a non-predefined optional field uses upcase
+# letters.
+class RGFA::Line::CustomOptfieldNameError < RGFA::Error; end
+
+# Error raised if an optional field tag name is used more than once.
+class RGFA::Line::DuplicatedOptfieldNameError < RGFA::Error; end
+
+# Error raised if the type of a predefined optional field does not
+# respect the specified type.
+class RGFA::Line::PredefinedOptfieldTypeError < RGFA::Error; end
+
+#
+# Require the child classes
+#
+require_relative "line/header.rb"
+require_relative "line/segment.rb"
+require_relative "line/path.rb"
+require_relative "line/link.rb"
+require_relative "line/containment.rb"
+
+# Extensions to the String core class.
+#
+class String
+
+ # Parses a line of a RGFA file and creates an object of the correct
+ # record type child class of {RGFA::Line}
+ # @return [subclass of RGFA::Line]
+ # @raise [RGFA::Error] if the fields do not comply to the RGFA specification
+ # @param validate [Integer] <i>(defaults to: 2)</i>
+ # see RGFA::Line#initialize
+ def to_rgfa_line(validate: 2)
+ split(RGFA::Line::SEPARATOR).to_rgfa_line(validate: validate)
+ end
+
+end
+
+# Extensions to the Array core class.
+#
+class Array
+
+ # Parses an array containing the fields of a RGFA file line and creates an
+ # object of the correct record type child class of {RGFA::Line}
+ # @note
+ # This method modifies the content of the array; if you still
+ # need the array, you must create a copy before calling it
+ # @return [subclass of RGFA::Line]
+ # @raise [RGFA::Error] if the fields do not comply to the RGFA specification
+ # @param validate [Integer] <i>(defaults to: 2)</i>
+ # see RGFA::Line#initialize
+ def to_rgfa_line(validate: 2)
+ RGFA::Line.subclass(shift).new(self, validate: validate)
+ end
+
+end
diff --git a/lib/rgfa/line/containment.rb b/lib/rgfa/line/containment.rb
new file mode 100644
index 0000000..df57a41
--- /dev/null
+++ b/lib/rgfa/line/containment.rb
@@ -0,0 +1,87 @@
+# A containment line of a RGFA file
+class RGFA::Line::Containment < RGFA::Line
+
+ RECORD_TYPE = :C
+ REQFIELDS = [:from, :from_orient, :to, :to_orient, :pos, :overlap]
+ PREDEFINED_OPTFIELDS = [:MQ, :NM]
+ DATATYPE = {
+ :from => :lbl,
+ :from_orient => :orn,
+ :to => :lbl,
+ :to_orient => :orn,
+ :pos => :pos,
+ :overlap => :cig,
+ :MQ => :i,
+ :NM => :i,
+ }
+
+ define_field_methods!
+
+ # Pairs the +from+ field with the +from_orient+ field and converts the
+ # pair by calling +to_oriented_segment+ on it.
+ # @return [RGFA::OrientedSegment] the oriented segment represented by the
+ # from/from_orient fields
+ def oriented_from
+ [from, from_orient].to_oriented_segment
+ end
+
+ # Pairs the +to+ field with the +to_orient+ field and converts the
+ # pair by calling +to_oriented_segment+ on it.
+ # @return [RGFA::OrientedSegment] the oriented segment represented by the
+ # to/to_orient fields
+ def oriented_to
+ [to, to_orient].to_oriented_segment
+ end
+
+ # The from segment name, in both cases where from is a segment name (Symbol)
+ # or a segment (RGFA::Line::Segment); the name is obtained by calling
+ # +to_sym+ on the content of the from field.
+ # @return [Symbol]
+ def from_name
+ from.to_sym
+ end
+
+ # The to segment name, in both cases where to is a segment name (Symbol)
+ # or a segment (RGFA::Line::Segment); the name is obtained by calling
+ # +to_sym+ on the content of the to field.
+ # @return [Symbol]
+ def to_name
+ to.to_sym
+ end
+
+ # @return [Integer,nil] the rightmost 0-based coordinate of the contained
+ # sequence in the container; nil if the overlap is unspecified
+ def rpos
+ return nil if overlap.empty?
+ rpos = pos
+ overlap.each do |op|
+ if [:M, :D].include?(op.code)
+ rpos += op.len
+ end
+ end
+ return rpos
+ end
+
+ # Returns true if the containment is normal, false otherwise
+ #
+ # <b> Definition of normal containment </b>
+ #
+ # Each containment has an equivalent reverse containment.
+ # Consider a containment of B (length:8) in A (length:100) at position 9 of A
+ # with a cigar 1M1I2M3D4M (i.e. rpos = 19).
+ #
+ # A+ B+ 1M1I2M3D4M 9 == A- B- 4M3D2M1I1M 80
+ # A+ B- 1M1I2M3D4M 9 == A- B+ 4M3D2M1I1M 80
+ # A- B+ 1M1I2M3D4M 9 == A+ B- 4M3D2M1I1M 80
+ # A- B- 1M1I2M3D4M 9 == A+ B+ 4M3D2M1I1M 80
+ #
+ # Pos in the reverse is equal to the length of A minus the right pos
+ # of B before reversing.
+ #
+ # We require here that A != B as A == B makes no sense for containments.
+ # Thus it is always possible to express the containment using a positive
+ # from orientation.
+ #
+ # For this reason the normality is simply defined as + from orientation.
+ #
+ # @return [Boolean]
+ #
+ def normal?
+ from_orient == :+
+ end
+
+end
diff --git a/lib/rgfa/line/header.rb b/lib/rgfa/line/header.rb
new file mode 100644
index 0000000..6741101
--- /dev/null
+++ b/lib/rgfa/line/header.rb
@@ -0,0 +1,92 @@
+# A header line of a RGFA file
+#
+# For examples on how to set the header data, see {RGFA::Headers}.
+#
+# @see RGFA::Line
+class RGFA::Line::Header < RGFA::Line
+
+ RECORD_TYPE = :H
+ REQFIELDS = []
+ PREDEFINED_OPTFIELDS = [:VN]
+ DATATYPE = {
+ :VN => :Z
+ }
+
+ define_field_methods!
+
+ # Set a header value (multi-value compatible).
+ #
+ # If a field does not exist yet, set it to value. If it exists and it is a
+ # {RGFA::FieldArray}, add the value to the field array. If it exists and it
+ # is not a field array, create a field array with the previous value and
+ # the new one
+ # @param fieldname [Symbol]
+ # @param value [Object]
+ # @param datatype [RGFA::Line::OPTFIELD_DATATYPE, nil] the datatype to use;
+ # the default is to determine the datatype according to the value or the
+ # previous values present in the field
+ def add(fieldname, value, datatype=nil)
+ fieldname = fieldname.to_sym
+ prev = get(fieldname)
+ if prev.nil?
+ set_datatype(fieldname, datatype) if datatype
+ set(fieldname, value)
+ return self
+ elsif !prev.kind_of?(RGFA::FieldArray)
+ prev = RGFA::FieldArray.new(get_datatype(fieldname), [prev])
+ set_datatype(fieldname, :J)
+ set(fieldname,prev)
+ end
+ prev.push_with_validation(value, datatype, fieldname)
+ return self
+ end
+
+ # Array of optional tags data.
+ #
+ # Returns the optional fields as an array of [fieldname, datatype, value]
+ # arrays. If a field is a FieldArray, this is splitted into multiple fields
+ # with the same fieldname.
+ # @return [Array<(Symbol, Symbol, Object)>]
+ # @api private
+ def tags
+ retval = []
+ optional_fieldnames.each do |of|
+ value = get(of)
+ if value.kind_of?(RGFA::FieldArray)
+ value.each do |elem|
+ retval << [of, value.datatype, elem]
+ end
+ else
+ retval << [of, get_datatype(of), value]
+ end
+ end
+ return retval
+ end
+
+ # Split the header line into single-tag lines.
+ #
+ # If a tag is a FieldArray, this is splitted into multiple fields
+ # with the same fieldname.
+ # @return [Array<RGFA::Line::Header>]
+ # @api private
+ def split
+ tags.map do |tagname, datatype, value|
+ h = RGFA::Line::Header.new([], validate: @validate)
+ h.set_datatype(tagname, datatype)
+ h.set(tagname, value)
+ h
+ end
+ end
+
+ # Merge an additional {RGFA::Line::Header} line into this header line.
+ # @param gfa_line [RGFA::Line::Header] the header line to merge
+ # @return [self]
+ # @api private
+ def merge(gfa_line)
+ gfa_line.optional_fieldnames.each do |of|
+ add(of, gfa_line.get(of), gfa_line.get_datatype(of))
+ end
+ self
+ end
+
+end
diff --git a/lib/rgfa/line/link.rb b/lib/rgfa/line/link.rb
new file mode 100644
index 0000000..8796fef
--- /dev/null
+++ b/lib/rgfa/line/link.rb
@@ -0,0 +1,379 @@
+# A link connects two segments, or a segment to itself.
+#
+class RGFA::Line::Link < RGFA::Line
+
+ RECORD_TYPE = :L
+ REQFIELDS = [:from, :from_orient, :to, :to_orient, :overlap]
+ PREDEFINED_OPTFIELDS = [:MQ, :NM, :RC, :FC, :KC]
+ DATATYPE = {
+ :from => :lbl,
+ :from_orient => :orn,
+ :to => :lbl,
+ :to_orient => :orn,
+ :overlap => :cig,
+ :MQ => :i,
+ :NM => :i,
+ :RC => :i,
+ :FC => :i,
+ :KC => :i,
+ }
+
+ define_field_methods!
+
+ # The other segment of a link
+ # @param segment [RGFA::Line::Segment, Symbol] segment name or instance
+ # @raise [RGFA::LineMissingError]
+ # if segment is not involved in the link
+ # @return [Symbol] the name of the other segment of the link
+ # if circular, then +segment+
+ def other(segment)
+ segment_name =
+ (segment.kind_of?(RGFA::Line::Segment) ? segment.name : segment.to_sym)
+ if segment_name == from.to_sym
+ to
+ elsif segment_name == to.to_sym
+ from
+ else
+ raise RGFA::LineMissingError,
+ "Link #{self} does not involve segment #{segment_name}"
+ end
+ end
+
+ # Compares the names (as symbols) of the from and the to segment.
+ # @return [Boolean] are the from and the to segment the same segment?
+ def circular?
+ from.to_sym == to.to_sym
+ end
+
+ # Compares the segment end of the from side (see #from_end) with the
+ # segment end of the to side (see #to_end).
+ # @return [Boolean] are the two segment ends connected by the link equal?
+ def circular_same_end?
+ from_end == to_end
+ end
+
+ # @return [RGFA::OrientedSegment] the oriented segment represented by the
+ # from/from_orient fields
+ def oriented_from
+ [from, from_orient].to_oriented_segment
+ end
+
+ # @return [RGFA::OrientedSegment] the oriented segment represented by the
+ # to/to_orient fields
+ def oriented_to
+ [to, to_orient].to_oriented_segment
+ end
+
+ # @return [RGFA::SegmentEnd] the segment end represented by the
+ # from/from_orient fields
+ def from_end
+ [from, from_orient == :+ ? :E : :B].to_segment_end
+ end
+
+ # @return [RGFA::SegmentEnd] the segment end represented by the
+ # to/to_orient fields
+ def to_end
+ [to, to_orient == :+ ? :B : :E].to_segment_end
+ end
+
+ # Signature of the segment ends, for debugging
+ # @api private
+ def segment_ends_s
+ [from_end.to_s, to_end.to_s].join("---")
+ end
+
+ # @param segment_end [RGFA::SegmentEnd] one of the two segment ends
+ # of the link
+ # @return [RGFA::SegmentEnd] the other segment end
+ #
+ # @raise [ArgumentError] if segment_end is not a valid segment end
+ # representation
+ # @raise [RuntimeError] if segment_end is not a segment end of the link
+ def other_end(segment_end)
+ segment_end = segment_end.to_segment_end
+ if (from_end == segment_end)
+ return to_end
+ elsif (to_end == segment_end)
+ return from_end
+ else
+ raise "Segment end '#{segment_end.inspect}' not found\n"+
+ "(from=#{from_end.inspect};to=#{to_end.inspect})"
+ end
+ end
+
+ # The from segment name, in both cases where from is a segment name (Symbol)
+ # or a segment (RGFA::Line::Segment); the name is obtained by calling
+ # +to_sym+ on the content of the from field.
+ # @return [Symbol]
+ def from_name
+ from.to_sym
+ end
+
+ # The to segment name, in both cases where to is a segment name (Symbol)
+ # or a segment (RGFA::Line::Segment); the name is obtained by calling
+ # +to_sym+ on the content of the to field.
+ # @return [Symbol]
+ def to_name
+ to.to_sym
+ end
+
+ # Returns true if the link is normal, false otherwise
+ #
+ # == Definition of normal link
+ #
+ # Each link has an equivalent reverse link. Consider a link of A to B
+ # with a overlap 1M1I2M:
+ #
+ # from+ to to+ (1M1I2M) == to- to from- (2M1D1M)
+ # from- to to- (1M1I2M) == to+ to from+ (2M1D1M)
+ # from+ to to- (1M1I2M) == to+ to from- (2M1D1M)
+ # from- to to+ (1M1I2M) == to- to from+ (2M1D1M)
+ #
+ # Consider also the special case, where from == to and the overlap is not
+ # specified, or equal to its reverse:
+ #
+ # from+ to from+ (*) == from- to from- (*) # left has a +; right has no +
+ # from- to from- (*) == from+ to from+ (*) # left has no +; right has a +
+ # from+ to from- (*) == from+ to from- (*) # left == right
+ # from- to from+ (*) == from- to from+ (*) # left == right
+ #
+ # Thus we define a link as normal if:
+ # - from < to (lexicographical comparison of segments)
+ # - from == to and overlap.to_s < reverse_overlap.to_s
+ # - from == to, overlap == reverse_overlap and at least one orientation is +
+ #
+ # @return [Boolean]
+ #
+ def normal?
+ if from_name < to_name
+ return true
+ elsif from_name > to_name
+ return false
+ else
+ overlap_s = overlap.to_s
+ reverse_overlap_s = reverse_overlap.to_s
+ if overlap_s < reverse_overlap_s
+ return true
+ elsif overlap_s > reverse_overlap_s
+ return false
+ else
+ return [from_orient, to_orient].include?(:+)
+ end
+ end
+ end
+
+ # Returns the unchanged link if the link is normal,
+ # otherwise reverses the link and returns it.
+ #
+ # @note The path references are not corrected by this method; therefore
+ # the method shall be used before the link is embedded in a graph.
+ #
+ # @return [RGFA::Line::Link] self
+ def normalize!
+ reverse! if !normal?
+ end
+
+ # Creates a link with both strands of the sequences inverted.
+ # The CIGAR operations (order/type) are inverted as well.
+ # Optional fields are left unchanged.
+ #
+ # @note The path references are not copied to the reverse link.
+ #
+ # @note This method shall be overridden if custom optional fields
+ # are defined, which have a ``reverse'' operation which determines
+ # their value in the equivalent but reverse link.
+ #
+ # @return [RGFA::Line::Link] the inverted link.
+ def reverse
+ l = self.clone
+ l.from = to
+ l.from_orient = (to_orient == :+ ? :- : :+)
+ l.to = from
+ l.to_orient = (from_orient == :+ ? :- : :+)
+ l.overlap = reverse_overlap
+ l
+ end
+
+ # Reverses the link inplace, i.e. sets:
+ # from = to
+ # from_orient = other_orient(to_orient)
+ # to = from
+ # to_orient = other_orient(from_orient)
+ # overlap = reverse_overlap.
+ #
+ # The optional fields are left unchanged.
+ #
+ # @note The path references are not reversed by this method; therefore
+ # the method shall be used before the link is embedded in a graph.
+ #
+ # @note This method shall be overridden if custom optional fields
+ # are defined, which have a ``reverse'' operation which determines
+ # their value in the equivalent but reverse link.
+ #
+ # @return [RGFA::Line::Link] self
+ def reverse!
+ tmp = self.from
+ self.from = self.to
+ self.to = tmp
+ tmp = self.from_orient
+ self.from_orient = (self.to_orient == :+) ? :- : :+
+ self.to_orient = (tmp == :+) ? :- : :+
+ self.overlap = self.reverse_overlap
+ return self
+ end
+
+ # Paths for which the link is required.
+ #
+ # The return value is an empty array
+ # if the link is not embedded in a graph.
+ #
+ # Otherwise, an array of tuples path/boolean is returned.
+ # The boolean value tells
+ # if the link is used in direct (true) or reverse direction (false)
+ # in the path.
+ # @return [Array<Array<(RGFA::Line::Path, Boolean)>>]
+ def paths
+ @paths ||= []
+ @paths
+ end
+
+ # Compute the overlap when the strand of both sequences is inverted.
+ #
+ # @return [RGFA::CIGAR]
+ def reverse_overlap
+ self.overlap.reverse
+ end
+
+ #
+ # Compares two links and determine their equivalence.
+ # Thereby, optional fields are not considered.
+ #
+ # @note Inverting the strand of both links and reversing
+ # the CIGAR operations (order/type), one obtains a
+ # reverse but equivalent link.
+ #
+ # @param other [RGFA::Line::Link] a link
+ # @return [Boolean] are self and other equivalent?
+ # @see #==
+ # @see #same?
+ # @see #reverse?
+ def eql?(other)
+ same?(other) or reverse?(other)
+ end
+
+ # Compares the optional fields of two links.
+ #
+ # @note This method shall be overridden if custom optional fields
+ # are defined, which have a ``reverse'' operation which determines
+ # their value in the equivalent but reverse link.
+ #
+ # @param other [RGFA::Line::Link] a link
+ # @return [Boolean] are self and other equivalent?
+ # @see #==
+ def eql_optional?(other)
+ (self.optional_fieldnames.sort == other.optional_fieldnames.sort) and
+ optional_fieldnames.each {|fn| self.get(fn) == other.get(fn)}
+ end
+
+ # Compares two links and determine their equivalence.
+ # Optional fields must have the same content.
+ #
+ # @note Inverting the strand of both links and reversing
+ # the CIGAR operations (order/type), one obtains an equivalent
+ # link.
+ #
+ # @param other [RGFA::Line::Link] a link
+ # @return [Boolean] are self and other equivalent?
+ # @see #eql?
+ # @see #eql_optional?
+ #def ==(other)
+ # eql?(other) and eql_optional?(other)
+ #end
+
+ # Compares two links and determine their equivalence.
+ # Thereby, optional fields are not considered.
+ #
+ # @param other [RGFA::Line::Link] a link
+ # @return [Boolean] are self and other equivalent?
+ # @see #eql?
+ # @see #reverse?
+ # @see #==
+ def same?(other)
+ (from_end == other.from_end and
+ to_end == other.to_end and
+ overlap == other.overlap)
+ end
+
+ # Compares the reverse of the link to another link
+ # and determine their equivalence.
+ # Thereby, optional fields are not considered.
+ #
+ # @param other [RGFA::Line::Link] the other link
+ # @return [Boolean] are the reverse of self and other equivalent?
+ # @see #eql?
+ # @see #same?
+ # @see #==
+ def reverse?(other)
+ (from_end == other.to_end and
+ to_end == other.from_end and
+ overlap == other.reverse_overlap)
+ end
+
+ # Computes an hash for including a link in an Hash tables,
+ # so that the hash of a link and its reverse is the same.
+ # Thereby, optional fields are not considered.
+ # @see #eql?
+ def hash
+ from_end.hash + to_end.hash + overlap.hash + reverse_overlap.to_s.hash
+ end
+
+ # Compares a link and optionally the reverse link,
+ # with two oriented_segments and optionally an overlap.
+ # @param [RGFA::OrientedSegment] other_oriented_from
+ # @param [RGFA::OrientedSegment] other_oriented_to
+ # @param equivalent [Boolean] shall the reverse link also be considered?
+ # @param [RGFA::CIGAR] other_overlap compared only if not empty
+ # @return [Boolean] does the link or, if +equivalent+,
+ # the reverse link go from the first
+ # oriented segment to the second with an overlap equal to the provided one
+ # (if not empty)?
+ def compatible?(other_oriented_from, other_oriented_to, other_overlap = [],
+ equivalent = true)
+ other_overlap = other_overlap.to_cigar
+ is_direct = compatible_direct?(other_oriented_from, other_oriented_to,
+ other_overlap)
+ if is_direct
+ return true
+ elsif equivalent
+ return compatible_reverse?(other_oriented_from, other_oriented_to,
+ other_overlap)
+ else
+ return false
+ end
+ end
+
+ # Compares a link with two oriented segments and optionally an overlap.
+ # @param [RGFA::OrientedSegment] other_oriented_from
+ # @param [RGFA::OrientedSegment] other_oriented_to
+ # @param [RGFA::CIGAR] other_overlap compared only if not empty
+ # @return [Boolean] does the link go from the first
+ # oriented segment to the second with an overlap equal to the provided one
+ # (if not empty)?
+ def compatible_direct?(other_oriented_from, other_oriented_to,
+ other_overlap = [])
+ (oriented_from == other_oriented_from and
+ oriented_to == other_oriented_to) and
+ (overlap.empty? or other_overlap.empty? or (overlap == other_overlap))
+ end
+
+ # Compares the reverse link with two oriented segments and optionally an
+ # overlap.
+ # @param [RGFA::OrientedSegment] other_oriented_from
+ # @param [RGFA::OrientedSegment] other_oriented_to
+ # @param [RGFA::CIGAR] other_overlap compared only if not empty
+ # @return [Boolean] does the reverse link go from the first
+ # oriented segment to the second with an overlap equal to the provided one
+ # (if not empty)?
+ def compatible_reverse?(other_oriented_from, other_oriented_to,
+ other_overlap = [])
+ (oriented_to == other_oriented_from.invert_orient and
+ oriented_from == other_oriented_to.invert_orient) and
+ (overlap.empty? or other_overlap.empty? or (overlap == other_overlap))
+ end
+
+end
diff --git a/lib/rgfa/line/path.rb b/lib/rgfa/line/path.rb
new file mode 100644
index 0000000..9ae9c5f
--- /dev/null
+++ b/lib/rgfa/line/path.rb
@@ -0,0 +1,106 @@
+# A path line of a RGFA file
+class RGFA::Line::Path < RGFA::Line
+
+ RECORD_TYPE = :P
+ REQFIELDS = [:path_name, :segment_names, :cigars]
+ PREDEFINED_OPTFIELDS = []
+ DATATYPE = {
+ :path_name => :lbl,
+ :segment_names => :lbs,
+ :cigars => :cgs,
+ }
+
+ define_field_methods!
+
+ # @note The field names are derived from the RGFA specification at:
+ # https://github.com/pmelsted/RGFA-spec/blob/master/RGFA-spec.md#path-line
+ # and were made all downcase with _ separating words;
+ # the cigar and segment_name regexps and name were changed to better
+ # implement what written in the commentaries of the specification
+ # (i.e. name pluralized and regexp changed to a comma-separated list
+ # for segment_name of segment names and orientations and for cigar of
+ # CIGAR strings);
+
+ # @return [Symbol] name of the path as symbol
+ def to_sym
+ name.to_sym
+ end
+
+ # Is the path circular? In this case the number of CIGARs must be
+ # equal to the number of segments.
+ # @return [Boolean]
+ def circular?
+ self.cigars.size == self.segment_names.size
+ end
+
+ # Is the path linear? This is the case when the number of CIGARs
+ # is equal to the number of segments minus 1, or the CIGARs are
+ # represented by a single "*".
+ def linear?
+ !circular?
+ end
+
+ # Are the cigars a single "*"? This is a compact representation of
+ # a linear path where all CIGARs are "*"
+ # @return [Boolean]
+ def undef_cigars?
+ self.cigars.size == 1 and self.cigars[0].empty?
+ end
+
+ # The links to which the path refers; it can be an empty array
+ # (e.g. from a line which is not embedded in a graph);
+ # the boolean is true if the equivalent reverse link is used.
+ # @return [Array<RGFA::Line::Link, Boolean>]
+ def links
+ @links ||= []
+ @links
+ end
+
+ # computes the list of links which are required to support
+ # the path
+ # @return [Array<[RGFA::OrientedSegment, RGFA::OrientedSegment, RGFA::Cigar]>]
+ # an array, which elements are 3-tuples (from oriented segment,
+ # to oriented segment, cigar)
+ def required_links
+ has_undef_cigars = self.undef_cigars?
+ retval = []
+ self.segment_names.size.times do |i|
+ j = i+1
+ if j == self.segment_names.size
+ circular? ? j = 0 : break
+ end
+ cigar = has_undef_cigars ? [] : self.cigars[i]
+ retval << [self.segment_names[i], self.segment_names[j], cigar]
+ end
+ retval
+ end
+
+ private
+
+ def validate_lists_size!
+ n_cigars = self.cigars.size
+ n_segments = self.segment_names.size
+ if n_cigars == n_segments - 1
+ # case 1: linear path
+ return true
+ elsif n_cigars == 1 and self.cigars[0].empty?
+ # case 2: linear path, single "*" to represent cigars which are all "*"
+ return true
+ elsif n_cigars == n_segments
+ # case 3: circular path
+ else
+ raise RGFA::Line::Path::ListLengthsError,
+ "Path has #{n_segments} oriented segments, "+
+ "but #{n_cigars} CIGARs"
+ end
+ end
+
+ # Validation specific for path lines: delegates to validate_lists_size!,
+ # which raises RGFA::Line::Path::ListLengthsError on inconsistent lists.
+ def validate_record_type_specific_info!
+ validate_lists_size!
+ end
+
+
+end
+
+# Error raised if number of segments and cigars are not consistent
+class RGFA::Line::Path::ListLengthsError < RGFA::Error; end
diff --git a/lib/rgfa/line/segment.rb b/lib/rgfa/line/segment.rb
new file mode 100644
index 0000000..1b59226
--- /dev/null
+++ b/lib/rgfa/line/segment.rb
@@ -0,0 +1,207 @@
+# A segment line of a RGFA file
+class RGFA::Line::Segment < RGFA::Line
+
+ RECORD_TYPE = :S
+ REQFIELDS = [:name, :sequence]
+ PREDEFINED_OPTFIELDS = [:LN, :RC, :FC, :KC]
+ DATATYPE = {
+ :name => :lbl,
+ :sequence => :seq,
+ :LN => :i,
+ :RC => :i,
+ :FC => :i,
+ :KC => :i
+ }
+
+ define_field_methods!
+
+ attr_writer :links, :containments, :paths
+
+ # References to the links in which the segment is involved.
+ #
+ # @!macro references_table
+ # The references are in four arrays which are
+ # accessed from a nested hash table. The first key is
+ # the direction (from or to), the second is the orientation
+ # (+ or -).
+ #
+ # @example
+ # segment.links[:from][:+]
+ #
+ # @return [Hash{RGFA::Line::DIRECTION => Hash{RGFA::Line::ORIENTATION => Array<RGFA::Line::Link>}}]
+ def links
+ @links ||= {:from => {:+ => [], :- => []},
+ :to => {:+ => [], :- => []}}
+ @links
+ end
+
+ # References to the containments in which the segment is involved.
+ # @!macro references_table
+ #
+ # @example
+ # segment.containments[:from][:+]
+ #
+ # @return [Hash{RGFA::Line::DIRECTION => Hash{RGFA::Line::ORIENTATION => Array<RGFA::Line::Containment>}}]
+ def containments
+ @containments ||= {:from => {:+ => [], :- => []},
+ :to => {:+ => [], :- => []}}
+ @containments
+ end
+
+ # References to the paths in which the segment is involved.
+ #
+ # The references are in two arrays which are
+ # accessed from a hash table. The key is the orientation
+ # (+ or -).
+ #
+ # @example
+ # segment.paths[:+]
+ #
+ # @return [Hash{RGFA::Line::ORIENTATION => Array<RGFA::Line::Path>}]
+ def paths
+ @paths ||= {:+ => [], :- => []}
+ @paths
+ end
+
+ # All containments where a segment is involved.
+ # @!macro this_is_a_copy
+ # @note the list shall be considered read-only, as this
+ # is a copy of the original arrays of references, concatenated
+ # to each other.
+ def all_containments
+ l = self.containments
+ l[:from][:+] + l[:from][:-] + l[:to][:+] + l[:to][:-]
+ end
+
+ # All links where the segment is involved.
+ # @!macro this_is_a_copy
+ def all_links
+ l = self.links
+ l[:from][:+] + l[:from][:-] + l[:to][:+] + l[:to][:-]
+ end
+
+ # All links and containments where the segment is involved
+ # (concatenation of the results of #all_links and #all_containments,
+ # in this order).
+ # @!macro this_is_a_copy
+ def all_connections
+ all_links + all_containments
+ end
+
+ # All paths where the segment is involved.
+ # @!macro this_is_a_copy
+ def all_paths
+ pt = self.paths
+ pt[:+] + pt[:-]
+ end
+
+ # All paths, links and containments where the segment is involved
+ # (concatenation of the results of #all_connections and #all_paths,
+ # in this order).
+ # @!macro this_is_a_copy
+ def all_references
+ all_connections + all_paths
+ end
+
+ # @raise [RGFA::Line::Segment::InconsistentLengthError]
+ # if sequence length and LN tag are not consistent.
+ def validate_length!
+ if sequence != "*" and optional_fieldnames.include?(:LN)
+ if self.LN != sequence.length
+ raise RGFA::Line::Segment::InconsistentLengthError,
+ "Length in LN tag (#{self.LN}) "+
+ "is different from length of sequence field (#{sequence.length})"
+ end
+ end
+ end
+
+ # @!macro [new] length
+ # @return [Integer] value of LN tag, if segment has LN tag
+ # @return [Integer] sequence length if no LN and sequence not "*"
+ # @return [nil] if sequence is "*"
+ # @see #length!
+ def length
+ if self.LN
+ self.LN
+ elsif sequence != "*"
+ sequence.length
+ else
+ nil
+ end
+ end
+
+ # @!macro length
+ # @!macro [new] length_needed
+ # @raise [RGFA::Line::Segment::UndefinedLengthError] if not an LN tag and
+ # the sequence is "*"
+ # @see #length
+ def length!
+ l = self.length()
+ raise RGFA::Line::Segment::UndefinedLengthError,
+ "No length information available" if l.nil?
+ return l
+ end
+
+ # @!macro [new] coverage
+ # The coverage computed from a count_tag.
+ # If unit_length is provided then: count/(length-unit_length+1),
+ # otherwise: count/length.
+ # The latter is a good approximation if length >>> unit_length.
+ # @param [Symbol] count_tag <i>(defaults to +:RC+)</i>
+ # integer tag storing the count, usually :KC, :RC or :FC
+ # @param [Integer] unit_length the (average) length of a read (for
+ # :RC), fragment (for :FC) or k-mer (for :KC)
+ # @return [Integer] coverage, if count_tag and length are defined
+ # @return [nil] otherwise
+ # @see #coverage!
+ def coverage(count_tag: :RC, unit_length: 1)
+ if optional_fieldnames.include?(count_tag) and self.length
+ return (self.get(count_tag).to_f)/(self.length-unit_length+1)
+ else
+ return nil
+ end
+ end
+
+ # @see #coverage
+ # @!macro coverage
+ # @raise [RGFA::Line::TagMissingError] if segment does not have count_tag
+ # @!macro length_needed
+ def coverage!(count_tag: :RC, unit_length: 1)
+   cov = coverage(count_tag: count_tag, unit_length: unit_length)
+   return cov unless cov.nil?
+   # length! raises if the missing information is the length;
+   # otherwise it is the count tag which is missing
+   self.length!
+   raise RGFA::Line::TagMissingError,
+     "Tag #{count_tag} undefined for segment #{name}"
+ end
+
+ # @return string representation of the segment
+ # @param [Boolean] without_sequence if +true+, output "*" instead of sequence
+ def to_s(without_sequence: false)
+ if !without_sequence
+ return super()
+ else
+ saved = self.sequence
+ self.sequence = "*"
+ retval = super()
+ self.sequence = saved
+ return retval
+ end
+ end
+
+ # The content of the +name+ field, converted by calling +to_sym+ on it.
+ # @return [Symbol] name of the segment as symbol
+ def to_sym
+ name.to_sym
+ end
+
+ private
+
+ # Validation specific for segment lines: delegates to validate_length!,
+ # which raises RGFA::Line::Segment::InconsistentLengthError if the LN tag
+ # and the length of the sequence differ.
+ def validate_record_type_specific_info!
+ validate_length!
+ end
+
+end
+
+# Error raised if length of segment cannot be computed
+class RGFA::Line::Segment::UndefinedLengthError < RGFA::Error; end
+
+# Error raised if length of segment and LN are not consistent
+class RGFA::Line::Segment::InconsistentLengthError < RGFA::Error; end
diff --git a/lib/rgfa/linear_paths.rb b/lib/rgfa/linear_paths.rb
new file mode 100644
index 0000000..d3c2588
--- /dev/null
+++ b/lib/rgfa/linear_paths.rb
@@ -0,0 +1,285 @@
+require_relative "segment_ends_path"
+
+#
+# Methods for the RGFA class, which allow to find and merge linear paths.
+#
+module RGFA::LinearPaths
+
+ require "set"
+
+ #
+ # Find a path without branches.
+ #
+ # The path must
+ # include +segment+ and excludes segments in +exclude+.
+ # Any segment used in the returned path will be added to +exclude+
+ #
+ # @param s [String|RGFA::Line::Segment] a segment name or instance
+ # @param exclude [Set<String>] a set of segment names to exclude from the path
+ # @return [Array<RGFA::SegmentEnd>]
+ #
+ def linear_path(s, exclude = Set.new)
+ s = s.to_sym
+ cs = connectivity(s)
+ segpath = RGFA::SegmentEndsPath.new()
+ [:B, :E].each_with_index do |et, i|
+ if cs[i] == 1
+ exclude << s
+ segpath.pop
+ segpath += traverse_linear_path(RGFA::SegmentEnd.new([s, et]), exclude)
+ end
+ end
+ return (segpath.size < 2) ? nil : segpath
+ end
+
+ # Find all unbranched paths in the graph.
+ #
+ # @return [Array<Array<RGFA::SegmentEnd>>]
+ def linear_paths
+ exclude = Set.new
+ retval = []
+ segnames = segment_names
+ progress_log_init(:linear_paths, "segments", segnames.size,
+ "Detect linear paths (#{segnames.size} segments)") if @progress
+ segnames.each do |sn|
+ progress_log(:linear_paths) if @progress
+ next if exclude.include?(sn)
+ retval << linear_path(sn, exclude)
+ end
+ progress_log_end(:linear_paths)
+ return retval.compact
+ end
+
+ # Merge a linear path, i.e. a path of segments without extra-branches
+ # @!macro [new] merge_lim
+ # Limitations: all containments and paths involving merged segments are
+ # deleted.
+ #
+ # @param segpath [Array<RGFA::SegmentEnd>] a linear path, such as that
+ # retrieved by {#linear_path}
+ # @!macro [new] merge_options
+ # @param options [Hash] optional keyword arguments
+ # @option options [String, :short, nil] :merged_name (nil)
+ # if nil, the merged_name is automatically computed; if :short,
+ # a name is computed starting with "merged1" and calling next until
+ # an available name is found; if String, the name to use
+ # @option options [Boolean] :cut_counts (false)
+ # if true, total count in merged segment m, composed of segments
+ # s of set S is multiplied by the factor Sum(|s in S|)/|m|
+ #
+ # @return [RGFA] self
+ # @see #merge_linear_paths
+ def merge_linear_path(segpath, **options)
+ return if segpath.size < 2
+ segpath.map!{|se|se.to_segment_end}
+ if segpath[1..-2].any? {|sn,et| connectivity(sn) != [1,1]}
+ raise ArgumentError, "The specified path is not linear"
+ end
+ merged, first_reversed, last_reversed =
+ create_merged_segment(segpath, options)
+ self << merged
+ link_merged(merged.name, segpath.first.to_segment_end.invert_end_type,
+ first_reversed)
+ link_merged(merged.name, segpath.last, last_reversed)
+ segpath.each do |sn_et|
+ delete_segment(sn_et.segment)
+ progress_log(:merge_linear_paths, 0.05) if @progress
+ end
+ self
+ end
+
+ # Merge all linear paths in the graph, i.e.
+ # paths of segments without extra-branches
+ # @!macro merge_lim
+ # @!macro merge_options
+ #
+ # @return [RGFA] self
+ def merge_linear_paths(**options)
+ paths = linear_paths
+ psize = paths.flatten.size / 2
+ progress_log_init(:merge_linear_paths, "segments", psize,
+ "Merge #{paths.size} linear paths (#{psize} segments)") if @progress
+ paths.each do |path|
+ merge_linear_path(path, **options)
+ end
+ progress_log_end(:merge_linear_paths)
+ self
+ end
+
+ private
+
+ # Traverse the links, starting from the segment end +segment_end+.
+ #
+ # At each step the links of the current segment end (+after+) and those of
+ # the opposite end of the same segment (+before+) are retrieved; the current
+ # segment end is added to the list and the traversal goes on to the other
+ # end of the first link in +after+, as long as the connectivity symbols
+ # computed from the number of links are [1,1] (or the list is still empty,
+ # i.e. the current segment end is the starting one).
+ #
+ # If a segment is reached whose name is included in +exclude+
+ # the traversing is interrupted. The +exclude+ set is updated, so that
+ # circular paths are avoided.
+ #
+ # *Arguments*:
+ # - +segment_end+ -> segment end from which the traversal starts
+ # - +exclude+ -> Set of names of already visited segments
+ #
+ # *Side Effects*:
+ # - The name of any element added to the returned list is also added
+ #   to +exclude+
+ #
+ # *Returns*:
+ # - A RGFA::SegmentEndsPath containing the segment ends of the traversed
+ #   path; the list is reversed if the end type of +segment_end+ is :B.
+ def traverse_linear_path(segment_end, exclude)
+ list = RGFA::SegmentEndsPath.new()
+ current = segment_end
+ loop do
+ after = links_of(current)
+ before = links_of(current.to_segment_end.invert_end_type)
+ cs = connectivity_symbols(before.size, after.size)
+ if cs == [1,1] or list.empty?
+ # internal element of the path (or starting element): go on to the
+ # segment end found at the other side of the first link
+ list << current
+ exclude << current.name
+ l = after.first
+ current = l.other_end(current).invert_end_type
+ break if exclude.include?(current.name)
+ elsif cs[0] == 1
+ # the first connectivity symbol is still 1, but the second is not:
+ # the element is added as last element of the path
+ list << current
+ exclude << current.name
+ break
+ else
+ break
+ end
+ end
+ return segment_end.end_type == :B ? list.reverse : list
+ end
+
+ def sum_of_counts(segpath, multfactor = 1)
+ retval = {}
+ segs = segpath.map {|sn,et|segment!(sn)}
+ [:KC, :RC, :FC].each do |count_tag|
+ segs.each do |s|
+ if s.optional_fieldnames.include?(count_tag)
+ retval[count_tag] ||= 0
+ retval[count_tag] += s.get(count_tag)
+ end
+ end
+ if retval[count_tag]
+ retval[count_tag] = (retval[count_tag] * multfactor).to_i
+ end
+ end
+ return retval
+ end
+
+ def reverse_segment_name(name, separator)
+ name.to_s.split(separator).map do |part|
+ openp = part[0] == "("
+ part = part[1..-1] if openp
+ closep = part[-1] == ")"
+ part = part[0..-2] if closep
+ part = (part[-1] == "^") ? part[0..-2] : part+"^"
+ part += ")" if openp
+ part = "(#{part}" if closep
+ part
+ end.reverse.join(separator)
+ end
+
+ def reverse_pos_array(pos_array, lastpos)
+ return nil if pos_array.nil? or lastpos.nil?
+ pos_array.map {|pos| lastpos - pos + 1}.reverse
+ end
+
+ def add_segment_to_merged(merged, segment, reversed, cut, init, options)
+ s = (reversed ? segment.sequence.rc[cut..-1] : segment.sequence[cut..-1])
+ if init
+ merged.sequence = s
+ merged.name = (options[:merged_name].nil? ?
+ segment.name : options[:merged_name])
+ merged.LN = segment.LN
+ else
+ (segment.sequence == "*") ? (merged.sequence = "*")
+ : (merged.sequence += s)
+ if options[:merged_name].nil?
+ merged.name = "#{merged.name}_#{segment.name}"
+ end
+ if merged.LN
+ segment.LN ? merged.LN += (segment.LN - cut)
+ : merged.LN = nil
+ end
+ end
+ end
+
+ # Creates the segment resulting from merging the segments of +segpath+.
+ #
+ # The merged segment is initialized as a clone of the first segment of the
+ # path; then the further segments are added one after the other, cutting
+ # from each of them the part overlapping the previous segment (as given by
+ # the overlap of the link between the two).
+ #
+ # Returns three values: the merged segment, first_reversed (true if the
+ # end type of the first segment end is :B) and last_reversed (computed
+ # from the last segment end of the path; nil if no segment was added).
+ #
+ # Raises ArgumentError if an overlap contains operations other than M/=.
+ def create_merged_segment(segpath, options)
+ merged = segment!(segpath.first.first).clone
+ total_cut = 0
+ a = segpath.first
+ first_reversed = (a.end_type == :B)
+ last_reversed = nil
+ # :short naming: "merged1", then String#next until the name is not
+ # used by any segment or path
+ if options[:merged_name] == :short
+ forbidden = (segment_names + path_names)
+ options[:merged_name] = "merged1"
+ while forbidden.include?(options[:merged_name])
+ options[:merged_name] = options[:merged_name].next
+ end
+ end
+ add_segment_to_merged(merged, segment(a.segment), first_reversed, 0, true,
+ options)
+ progress_log(:merge_linear_paths, 0.95) if @progress
+ (segpath.size-1).times do |i|
+ # a is the segment end left by the link, b the one which is entered
+ b = segpath[i+1].to_segment_end.invert_end_type
+ l = link!(a, b)
+ # the cut is the total length of the overlap (0 if it is empty)
+ if l.overlap == []
+ cut = 0
+ elsif l.overlap.all?{|op|[:M, :"="].include?(op.code)}
+ cut = l.overlap.map(&:len).inject(:+)
+ else
+ raise ArgumentError,
+ "Merging is only allowed if all operations are M/="
+ end
+ total_cut += cut
+ last_reversed = (b[1] == :E)
+ add_segment_to_merged(merged, segment(b.segment), last_reversed, cut,
+ false, options)
+ a = b.to_segment_end.invert_end_type
+ if @progress
+ progress_log(:merge_linear_paths, 0.95)
+ end
+ end
+ # if the sequence is defined, LN is set from it or checked against it
+ if merged.sequence != "*"
+ if merged.LN.nil?
+ merged.LN = merged.sequence.length
+ elsif @validate and merged.LN != merged.sequence.length
+ raise RGFA::Line::Segment::InconsistentLengthError,
+ "Computed sequence length #{merged.sequence.length} "+
+ "and computed LN #{merged.LN} differ"
+ end
+ end
+ # counts are set only if the length of the merged segment is known;
+ # with the :cut_counts option the sums are multiplied by the factor
+ # LN / (total_cut + LN)
+ if merged.LN.nil?
+ [:KC, :RC, :FC].each {|count_tag| merged.set(count_tag, nil)}
+ else
+ sum_of_counts(segpath, (options[:cut_counts] ?
+ merged.LN.to_f / (total_cut+merged.LN) : 1)).
+ each do |count_tag, count|
+ merged.set(count_tag, count)
+ end
+ end
+ return merged, first_reversed, last_reversed
+ end
+
+ def link_merged(merged_name, segment_end, reversed)
+ links_of(segment_end).each do |l|
+ l2 = l.clone
+ if l2.to == segment_end.first
+ l2.to = merged_name
+ if reversed
+ l2.to_orient = RGFA::OrientedSegment.invert(l2.to_orient)
+ end
+ else
+ l2.from = merged_name
+ if reversed
+ l2.from_orient = RGFA::OrientedSegment.invert(l2.from_orient)
+ end
+ end
+ self << l2
+ end
+ end
+
+end
diff --git a/lib/rgfa/lines.rb b/lib/rgfa/lines.rb
new file mode 100644
index 0000000..8a719f8
--- /dev/null
+++ b/lib/rgfa/lines.rb
@@ -0,0 +1,155 @@
+require_relative "error"
+
+#
+# Methods for the RGFA class, which allow to handle lines of multiple types.
+#
+module RGFA::Lines
+
+ # Add a line to a RGFA
+ #
+ # @overload <<(gfa_line_string)
+ # @param [String] gfa_line_string representation of a RGFA line
+ # @overload <<(gfa_line)
+ # @param [RGFA::Line] gfa_line instance of a subclass of RGFA::Line
+ # @raise [RGFA::DuplicatedLabelError] if multiple segment or path lines
+ # with the same name are added
+ # @return [RGFA] self
+ def <<(gfa_line)
+ gfa_line = gfa_line.to_rgfa_line(validate: @validate)
+ rt = gfa_line.record_type
+ case rt
+ when :H
+ add_header(gfa_line)
+ when :S
+ add_segment(gfa_line)
+ when :L
+ add_link(gfa_line)
+ when :C
+ add_containment(gfa_line)
+ when :P
+ add_path(gfa_line)
+ else
+ raise # this never happens, as already catched by gfa_line init
+ end
+ return self
+ end
+
+ # Delete elements from the RGFA graph
+ # @overload rm(segment)
+ # @param segment [String, RGFA::Line::Segment] segment name or instance
+ # @overload rm(path)
+ # @param path [String, RGFA::Line::Path] path name or instance
+ # @overload rm(link)
+ # @param link [RGFA::Line::Link] link
+ # @overload rm(containment)
+ # @param containment [RGFA::Line::Containment] containment
+ # @overload rm(:headers)
+ # Remove all headers
+ # @overload rm(array)
+ # Calls {#rm} using each element of the array as argument
+ # @param array [Array]
+ # @overload rm(method_name, *args)
+ # Call a method of RGFA instance, then {#rm} for each returned value
+ # @param method_name [Symbol] method to call
+ # @param args arguments of the method
+ # @return [RGFA] self
+ def rm(x, *args)
+ if x.kind_of?(RGFA::Line)
+ raise ArgumentError,
+ "One argument required if first RGFA::Line" if !args.empty?
+ case x.record_type
+ when :H then raise ArgumentError, "Cannot remove single header lines"
+ when :S then delete_segment(x)
+ when :P then delete_path(x)
+ when :L then delete_link(x)
+ when :C then delete_containment(x)
+ end
+ elsif x.kind_of?(Symbol)
+ if @segments.has_key?(x)
+ if !args.empty?
+ raise ArgumentError, "One arguments required if first segment name"
+ end
+ delete_segment(x)
+ elsif @paths.has_key?(x)
+ if !args.empty?
+ raise ArgumentError, "One argument required if first path name"
+ end
+ delete_path(x)
+ elsif x == :headers
+ if !args.empty?
+ raise ArgumentError, "One argument required if first :headers"
+ end
+ delete_headers
+ else
+ if respond_to?(x)
+ rm(send(x, *args))
+ else
+ raise ArgumentError, "Cannot remove #{x.inspect}"
+ end
+ end
+ elsif x.kind_of?(String)
+ rm(x.to_sym, *args)
+ elsif x.kind_of?(Array)
+ x.each {|elem| rm(elem, *args)}
+ elsif x.nil?
+ return self
+ else
+ raise ArgumentError, "Cannot remove #{x.inspect}"
+ end
+ return self
+ end
+
+ # Rename a segment or a path
+ #
+ # @param old_name [String] the name of the segment or path to rename
+ # @param new_name [String] the new name for the segment or path
+ #
+ # @raise[RGFA::DuplicatedLabelError]
+ # if +new_name+ is already a segment or path name
+ # @return [RGFA] self
+ def rename(old_name, new_name)
+ old_name = old_name.to_sym
+ new_name = new_name.to_sym
+ s = segment(old_name)
+ pt = nil
+ if s.nil?
+ pt = path(old_name)
+ if pt.nil?
+ raise RGFA::LineMissingError,
+ "#{old_name} is not a path or segment name"
+ end
+ end
+ if segment(new_name) or path(new_name)
+ raise RGFA::DuplicatedLabelError,
+ "#{new_name} is already a path or segment name"
+ end
+ if s
+ s.name = new_name
+ @segments.delete(old_name)
+ @segments[new_name] = s
+ else
+ pt.path_name = new_name
+ @paths.delete(old_name)
+ @paths[new_name] = pt
+ end
+ self
+ end
+
+ private
+
+ def lines
+ headers + segments + links + containments + paths
+ end
+
+ # Calls +each+ on the array returned by #lines, forwarding the given block.
+ def each_line(&block)
+ lines.each(&block)
+ end
+
+end
+
+# Exception raised if a label for segment or path is duplicated
+class RGFA::DuplicatedLabelError < RGFA::Error; end
+
+# The error raised by banged line finders if no line respecting the criteria
+# exist in the RGFA
+class RGFA::LineMissingError < RGFA::Error; end
diff --git a/lib/rgfa/links.rb b/lib/rgfa/links.rb
new file mode 100644
index 0000000..824b75f
--- /dev/null
+++ b/lib/rgfa/links.rb
@@ -0,0 +1,242 @@
+require_relative "error"
+
+#
+# Methods for the RGFA class, which allow to handle links in the graph.
+#
+module RGFA::Links
+
+ def add_link(gfa_line)
+ gfa_line = gfa_line.to_rgfa_line(validate: @validate)
+ gfa_line.normalize!
+ l = nil
+ if segment(gfa_line.from) and segment(gfa_line.to)
+ l = link_from_to(gfa_line.oriented_from,
+ gfa_line.oriented_to,
+ gfa_line.overlap)
+ end
+ if l.nil?
+ @links << gfa_line
+ [:from, :to].each do |dir|
+ segment_name = gfa_line.send(dir).to_sym
+ orient = gfa_line.send(:"#{dir}_orient").to_sym
+ if !@segments.has_key?(segment_name)
+ raise RGFA::LineMissingError if @segments_first_order
+ @segments[segment_name] =
+ RGFA::Line::Segment.new({:name => segment_name},
+ virtual: true)
+ end
+ @segments[segment_name].links[dir][orient] << gfa_line
+ gfa_line.send(:"#{dir}=", @segments[segment_name])
+ end
+ elsif l.virtual?
+ l.real!(gfa_line)
+ else
+ return
+ end
+ end
+ protected :add_link
+
+ # Deletes a link and all paths depending on it
+ #
+ # @param l [RGFA::Line::Link] link instance
+ # @return [RGFA] self
+ def delete_link(l)
+ @links.delete(l)
+ segment(l.from).links[:from][l.from_orient].delete(l)
+ segment(l.to).links[:to][l.to_orient].delete(l)
+ l.paths.each {|pt, orient| delete_path(pt)}
+ end
+
+ # Remove all links of a segment end, except that to the other specified
+ # segment end.
+ # @param segment_end [RGFA::SegmentEnd] the segment end
+ # @param other_end [RGFA::SegmentEnd] the other segment end
+ # @param conserve_components [Boolean] <i>(defaults to: +false+)</i>
+ # Do not remove links if removing them breaks the graph into unconnected
+ # components.
+ # @return [RGFA] self
+ def delete_other_links(segment_end, other_end, conserve_components: false)
+ other_end = other_end.to_segment_end
+ links_of(segment_end).each do |l|
+ if l.other_end(segment_end) != other_end
+ if !conserve_components or !cut_link?(l)
+ delete_link(l)
+ end
+ end
+ end
+ end
+
+ # All links of the graph
+ # @return [Array<RGFA::Line::Link>]
+ def links
+ # the internal array itself is returned, not a copy
+ @links
+ end
+
+ # Finds links of the specified end of segment.
+ #
+ # @param [RGFA::SegmentEnd] segment_end a segment end
+ #
+ # @return [Array<RGFA::Line::Link>] if segment_end[1] == :E,
+ # links from sn with from_orient + and to sn with to_orient -
+ # @return [Array<RGFA::Line::Link>] if segment_end[1] == :B,
+ # links to sn with to_orient + and from sn with from_orient -
+ #
+ # @note to add or remove links, use the appropriate methods;
+ # adding or removing links from the returned array will not work
+ def links_of(segment_end)
+ segment_end = segment_end.to_segment_end
+ s = segment!(segment_end.segment)
+ o = segment_end.end_type == :E ? [:+,:-] : [:-,:+]
+ s.links[:from][o[0]] + s.links[:to][o[1]]
+ end
+
+ # Finds segment ends connected to the specified segment end.
+ #
+ # @param [RGFA::SegmentEnd] segment_end a segment end
+ #
+ # @return [Array<RGFA::SegmentEnd>] segment ends connected by links
+ # to +segment_end+
+ def neighbours(segment_end)
+ links_of(segment_end).map {|l| l.other_end(segment_end) }
+ end
+
+ # Searches all links between +segment_end1+ and +segment_end2+
+ #
+ # @!macro [new] two_segment_ends
+ # @param segment_end1 [RGFA::SegmentEnd] a segment end
+ # @param segment_end2 [RGFA::SegmentEnd] a segment end
+ # @return [Array<RGFA::Line::Link>] (possibly empty)
+ def links_between(segment_end1, segment_end2)
+ segment_end1 = segment_end1.to_segment_end
+ segment_end2 = segment_end2.to_segment_end
+ links_of(segment_end1).select do |l|
+ l.other_end(segment_end1) == segment_end2
+ end
+ end
+
+ # @!macro [new] link
+ # Searches a link between +segment_end1+ and +segment_end2+
+ # @!macro two_segment_ends
+ # @return [RGFA::Line::Link] the first link found
+ # @return [nil] if no link is found.
+ def link(segment_end1, segment_end2)
+ segment_end1 = segment_end1.to_segment_end
+ segment_end2 = segment_end2.to_segment_end
+ links_of(segment_end1).each do |l|
+ return l if l.other_end(segment_end1) == segment_end2
+ end
+ return nil
+ end
+
+ # @!macro link
+ # @raise [RGFA::LineMissingError] if no link is found.
+ def link!(segment_end1, segment_end2)
+ l = link(segment_end1, segment_end2)
+ raise RGFA::LineMissingError,
+ "No link was found: "+
+ "#{segment_end1.to_s} -- "+
+ "#{segment_end2.to_s}" if l.nil?
+ l
+ end
+
+ # Find links from the segment in the specified orientation
+ # (or the equivalent links, i.e. to the segment in opposite orientation).
+ #
+ # @param [RGFA::OrientedSegment] oriented_segment a segment with orientation
+ # @param equivalent [Boolean] return also equivalent links.
+ # @return [Array<RGFA::Line::Link>]
+ # @note to add or remove links, use the appropriate methods;
+ # adding or removing links from the returned array will not work
+ def links_from(oriented_segment, equivalent = true)
+ oriented_segment = oriented_segment.to_oriented_segment
+ s = segment!(oriented_segment.segment)
+ retval = s.links[:from][oriented_segment.orient]
+ if equivalent
+ retval + s.links[:to][oriented_segment.orient_inverted]
+ else
+ retval
+ end
+ end
+
+ # Find links to the segment in the specified orientation
+ # (or the equivalent links, i.e. from the segment in opposite orientation).
+ #
+ # @param [RGFA::OrientedSegment] oriented_segment a segment with orientation
+ # @param equivalent [Boolean] return also equivalent links.
+ # @return [Array<RGFA::Line::Link>]
+ # @note to add or remove links, use the appropriate methods;
+ # adding or removing links from the returned array will not work
+ def links_to(oriented_segment, equivalent = true)
+ oriented_segment = oriented_segment.to_oriented_segment
+ s = segment!(oriented_segment.segment)
+ retval = s.links[:to][oriented_segment.orient]
+ if equivalent
+ retval + s.links[:from][oriented_segment.orient_inverted]
+ else
+ retval
+ end
+ end
+
+ # Search all links from a segment S1 in a given orientation
+ # to another segment S2 in a given orientation, or the equivalent
+ # links from S2 to S1 with inverted orientations.
+ #
+ # @param [RGFA::OrientedSegment] oriented_segment1 a segment with orientation
+ # @param [RGFA::OrientedSegment] oriented_segment2 a segment with orientation
+ # @param [RGFA::CIGAR] cigar shall match if not empty/undef
+ # @param equivalent [Boolean] return also equivalent links.
+ # @return [Array<RGFA::Line::Link>]
+ # @note to add or remove links, use the appropriate methods;
+ # adding or removing links from the returned array will not work
+ def links_from_to(oriented_segment1, oriented_segment2,
+ cigar = [], equivalent = true)
+ oriented_segment1 = oriented_segment1.to_oriented_segment
+ oriented_segment2 = oriented_segment2.to_oriented_segment
+ links_from(oriented_segment1, equivalent).select do |l|
+ l.compatible?(oriented_segment1, oriented_segment2, cigar, equivalent)
+ end
+ end
+
+ # Search the link from a segment S1 in a given orientation
+ # to another segment S2 in a given orientation, or the equivalent
+ # link from S2 to S1 with inverted orientations.
+ #
+ # @param [RGFA::OrientedSegment] oriented_segment1 a segment with orientation
+ # @param [RGFA::OrientedSegment] oriented_segment2 a segment with orientation
+ # @param [RGFA::CIGAR] cigar shall match if not empty/undef
+ # @param equivalent [Boolean] return also equivalent links.
+ # @return [RGFA::Line::Link] the first link found
+ # @return [nil] if no link is found.
+ def link_from_to(oriented_segment1, oriented_segment2,
+ cigar = [], equivalent = true)
+ oriented_segment1 = oriented_segment1.to_oriented_segment
+ oriented_segment2 = oriented_segment2.to_oriented_segment
+ links_from(oriented_segment1, equivalent).select do |l|
+ return l if l.compatible?(oriented_segment1, oriented_segment2,
+ cigar, equivalent)
+ end
+ return nil
+ end
+
+ # Search the link from a segment S1 in a given orientation
+ # to another segment S2 in a given orientation, or the equivalent
+ # link from S2 to S1 with inverted orientations.
+ #
+ # @param [RGFA::OrientedSegment] oriented_segment1 a segment with orientation
+ # @param [RGFA::OrientedSegment] oriented_segment2 a segment with orientation
+ # @param [RGFA::CIGAR] cigar shall match if not empty/undef
+ # @param equivalent [Boolean] return also equivalent links.
+ # @return [RGFA::Line::Link] the first link found
+ # @raise [RGFA::LineMissingError] if no link is found.
+ def link_from_to!(oriented_segment1, oriented_segment2,
+ cigar = [], equivalent = true)
+ l = link_from_to(oriented_segment1, oriented_segment2,
+ cigar, equivalent)
+ raise RGFA::LineMissingError,
+ "No link was found: "+
+ "#{oriented_segment1.join(":")} -> "+
+ "#{oriented_segment2.join(":")}" if l.nil?
+ l
+ end
+
+end
diff --git a/lib/rgfa/logger.rb b/lib/rgfa/logger.rb
new file mode 100644
index 0000000..7ba2304
--- /dev/null
+++ b/lib/rgfa/logger.rb
@@ -0,0 +1,192 @@
+#
+# This class allows to output a message to the log file or STDERR and
+# to keep track of the progress of a method which takes long time to complete.
+#
+# @api private
+#
+class RGFA::Logger
+
+ # Information about the progress of a computation
+ ProgressData = Struct.new(:counter, :units, :partsize,
+ :lastpart, :total, :starttime,
+ :strlen)
+
+ # Create a Logger instance
+ #
+ # @param channel [#puts]
+ # where to output (default: STDERR)
+ # @param prefix [String]
+ # output prefix (default: "#")
+ # @param verbose_level [Integer]
+ # 0: no logging; >0: the higher, the more logging
+ # @return [RGFA::Logger]
+ def initialize(verbose_level: 1, channel: STDERR, prefix: "#")
+ @progress = false
+ if !verbose_level.kind_of?(Integer)
+ raise ArgumentError, "verbose_level must be an Integer"
+ end
+ if !channel.respond_to?(:puts)
+ raise TypeError, "channel must provide a puts method"
+ end
+ @channel = channel
+ @pfx = prefix
+ @verbose_level = verbose_level
+ @data = {}
+ end
+
+ # Output a message
+ #
+ # @param msg [String] message to output
+ # @param min_verbose_level [Integer]
+ # @return [void]
+ def log(msg, min_verbose_level=1)
+ @channel.puts "#@pfx #{msg}" if @verbose_level >= min_verbose_level
+ return nil
+ end
+
+ # Enable output from the Logger instance
+ #
+ # @param part [Float]
+ # - part = 0 => output at every call of {RGFA::Logger.progress_log}
+ # - 0 < part < 1 => output once per part of the total progress
+ # (e.g. 0.001 = log every 0.1% progress)
+ # - part = 1 => output only total elapsed time
+ # @return [void]
+ def enable_progress(part: 0.1)
+ if part < 0 or part > 1
+ raise ArgumentError, "part must be in range [0..1]"
+ end
+ @progress = true
+ @part = part
+ @channel.puts "#@pfx Progress logging enabled" if @verbose_level > 0
+ return nil
+ end
+
+ # Disable progress logging
+ # @return [void]
+ def disable_progress
+ @progress = false
+ @channel.puts "#@pfx Progress logging disabled" if @verbose_level > 0
+ return nil
+ end
+
+ # @!macro progress_init
+ # Initialize progress logging for a computation
+ # @param symbol [Symbol] a symbol assigned to the computation
+ # @param units [String] a string with the name of the units, in plural
+ # @param total [Integer] total number of units
+ # @param initmsg [String] an optional message to output at the beginning
+ # @return [void]
+ def progress_init(symbol, units, total, initmsg = nil)
+ return nil if !@progress or total == 0
+ str = "#@pfx 0.0% #{units} processed"
+ @data[symbol] = ProgressData.new(0, units, (@part*total).to_i, 1, total,
+ Time.now, str.size)
+ @channel.puts "#@pfx #{initmsg}" if initmsg
+ @channel.print str if @part != 1
+ return nil
+ end
+
+ # @!macro [new] progress_log
+ # Updates progress logging for a computation
+ # @!macro [new] prlog
+ # @param symbol [Symbol] the symbol assigned to the computation at
+ # init time
+ # @param keyargs [Hash] additional units to display, with their current
+ # value (e.g. segments_processed: 10000)
+ # @param progress [Integer] how many units were processed
+ # @return [void]
+ def progress_log(symbol, progress=1, **keyargs)
+ return nil if !@progress or @part == 1
+ data = @data[symbol]
+ return nil if data.nil?
+ data.counter += progress
+ if data.counter == data.total
+ progress_end(symbol)
+ elsif data.partsize == 0 or
+ (data.counter / data.partsize).to_i > data.lastpart
+ return nil if data.partsize == 0 and @part > 0
+ # this means total is very small
+ data.lastpart = data.counter / data.partsize if data.partsize > 0
+ done = data.counter.to_f / data.total
+ t = Time.now - data.starttime
+ eta = (t / done) - t
+ tstr= ("Elapsed: %02dh %02dmin %02ds" % [t/3600, t/60%60, t%60])
+ etastr = ("ETA: %02dh %02dmin %02ds" % [eta/3600, eta/60%60, eta%60])
+ donestr = "%.1f" % (done*100)
+ keystr = ""
+ keyargs.each {|k,v| keystr << "; #{k}: #{v}"}
+ str = "#@pfx #{donestr}% #{data.units} processed "+
+ "[#{tstr}; #{etastr}#{keystr}]"
+ if str.size > data.strlen
+ data.strlen = str.size
+ spacediff = ""
+ else
+ spacediff = " "*(data.strlen-str.size)
+ end
+ @channel.print "\r#{str}#{spacediff}"
+ @channel.flush
+ end
+ return nil
+ end
+
+ # @!macro [new] progress_end
+ # Completes progress logging for a computation
+ # @!macro prlog
+ # @return [void]
+ def progress_end(symbol, **keyargs)
+ return if !@progress
+ data = @data[symbol]
+ return if data.nil?
+ t = Time.now - data.starttime
+ tstr= ("Elapsed time: %02dh %02dmin %02ds" % [t/3600, t/60%60, t%60])
+ quantity = @part == 1 ? data.total.to_s : "100.0%"
+ keystr = ""
+ keyargs.each {|k,v| keystr << "; #{k}: #{v}"}
+ str = "#@pfx #{quantity} #{data.units} processed [#{tstr}#{keystr}]"
+ spacediff = " "*([data.strlen - str.size,0].max)
+ @channel.print "\r" if @part != 1
+ @channel.puts "#{str}#{spacediff}"
+ @channel.flush
+ @data.delete(symbol)
+ return nil
+ end
+
+end
+
+# Progress logging related-methods for RGFA class
+module RGFA::LoggerSupport
+
+ # Activate logging of progress
+ # @return [RGFA] self
+ def enable_progress_logging(part: 0.1, channel: STDERR)
+ @progress = RGFA::Logger.new(channel: channel)
+ @progress.enable_progress(part: part)
+ return self
+ end
+
+ # @!macro progress_init
+ # @return [RGFA] self
+ # @api private
+ def progress_log_init(symbol, units, total, initmsg = nil)
+ @progress.progress_init(symbol, units, total, initmsg) if @progress
+ return self
+ end
+
+ # @!macro progress_log
+ # @return [RGFA] self
+ # @api private
+ def progress_log(symbol, progress=1, **keyargs)
+ @progress.progress_log(symbol, progress) if @progress
+ return self
+ end
+
+ # @!macro progress_end
+ # @return [RGFA] self
+ # @api private
+ def progress_log_end(symbol, **keyargs)
+ @progress.progress_end(symbol) if @progress
+ return self
+ end
+
+end
diff --git a/lib/rgfa/multiplication.rb b/lib/rgfa/multiplication.rb
new file mode 100644
index 0000000..ccf7237
--- /dev/null
+++ b/lib/rgfa/multiplication.rb
@@ -0,0 +1,156 @@
+require_relative "error.rb"
+
+#
+# Method for the RGFA class, which allow to split a segment into
+# multiple copies.
+#
+module RGFA::Multiplication
+
+ # Create multiple copies of a segment.
+ #
+ # == Automatic computation of the copy names
+ #
+ # - Can be overridden, by providing an array of copy names.
+ # - First, it is checked if the name of the original segment ends with a
+ # relevant
+ # string, i.e. a lower case letter (for +:lowcase+), an upper case letter
+ # (for +:upcase+), a digit (for +:number+), or the string +"_copy"+
+ # plus one or more optional digits (for +:copy+).
+ # - If so, it is assumed, it was already a copy, and it is not
+ # altered.
+ # - If not, then +a+ (for +:lowcase+), +A+ (for +:upcase+), +1+ (for
+ # +:number+), +_copy+ (for +:copy+) is appended to the string.
+ # - Then, in all
+ # cases, next (*) is called on the string, until a valid, non-existent name
+ # is found for each of the segment copies
+ # - (*) = except for +:copy+, where
+ # for the first copy no digit is present, but for the following is,
+ # i.e. the segment names will be +:copy+, +:copy2+, +:copy3+, etc.
+ #
+ # @param [Integer] factor multiplication factor; if 0, delete the segment;
+ # if 1; do nothing; if > 1; number of copies to create
+ # @param segment [String, RGFA::Line::Segment] segment name or instance
+ # @param [:lowcase, :upcase, :number, :copy, Array<String>] copy_names
+ # <i>(Defaults to: +:lowcase+)</i>
+ # Array of names for the copies of the segment,
+ # or a symbol, which defines a system to compute the names from the name of
+ # the original segment. See "automatic computation of the copy names".
+ # @param [Boolean] conserve_components <i>(Defaults to: +true+)</i>
+ # If factor == 0 (i.e. deletion), delete segment only if
+ # {#cut_segment?}(segment) is +false+.
+ #
+ # @return [RGFA] self
+ def multiply(segment, factor, copy_names: :lowcase,
+ conserve_components: true)
+ segment_name = segment.kind_of?(RGFA::Line) ? segment.name : segment
+ if factor < 2
+ return self if factor == 1
+ return self if cut_segment?(segment_name) and conserve_components
+ return delete_segment(segment_name)
+ end
+ s = segment!(segment_name)
+ divide_segment_and_connection_counts(s, factor)
+ copy_names = compute_copy_names(copy_names, segment_name, factor)
+ copy_names.each {|cn| clone_segment_and_connections(s, cn)}
+ return self
+ end
+
+ private
+
+ def compute_copy_names(copy_names, segment_name, factor)
+ return nil if factor < 2
+ accepted = [:lowcase, :upcase, :number, :copy]
+ if copy_names.kind_of?(Array)
+ return copy_names
+ elsif !accepted.include?(copy_names)
+ raise ArgumentError,
+ "copy_names shall be an array of names or one of: "+
+ accepted.inspect
+ end
+ retval = []
+ next_name = segment_name.to_s
+ case copy_names
+ when :lowcase
+ if next_name =~ /^.*[a-z]$/
+ next_name = next_name.next
+ else
+ next_name += "b"
+ end
+ when :upcase
+ if next_name =~ /^.*[A-Z]$/
+ next_name = next_name.next
+ else
+ next_name += "B"
+ end
+ when :number
+ if next_name =~ /^.*[0-9]$/
+ next_name = next_name.next
+ else
+ next_name += "2"
+ end
+ when :copy
+ if next_name =~ /^.*_copy(\d*)$/
+ next_name += "1" if $1 == ""
+ next_name = next_name.next
+ copy_names = :number
+ else
+ next_name += "_copy"
+ end
+ end
+ while retval.size < (factor-1)
+ while retval.include?(next_name) or
+ @segments.has_key?(next_name.to_sym) or
+ @paths.has_key?(next_name.to_sym)
+ if copy_names == :copy
+ next_name += "1"
+ copy_names = :number
+ end
+ next_name = next_name.next
+ end
+ retval << next_name
+ end
+ return retval
+ end
+
+ def divide_counts(gfa_line, factor)
+ [:KC, :RC, :FC].each do |count_tag|
+ if gfa_line.optional_fieldnames.include?(count_tag)
+ value = (gfa_line.get(count_tag).to_f / factor)
+ gfa_line.set(count_tag, value.to_i)
+ end
+ end
+ end
+
+ def divide_segment_and_connection_counts(segment, factor)
+ divide_counts(segment, factor)
+ [:links,:containments].each do |rt|
+ [:from,:to].each do |dir|
+ [:+, :-].each do |o|
+ segment.send(rt)[dir][o].each do |l|
+ # circular link counts shall be divided only ones
+ next if dir == :to and l.from == l.to
+ divide_counts(l, factor)
+ end
+ end
+ end
+ end
+ end
+
+ def clone_segment_and_connections(segment, clone_name)
+ cpy = segment.clone
+ cpy.name = clone_name
+ self << cpy
+ [:links,:containments].each do |rt|
+ [:from,:to].each do |dir|
+ [:+, :-].each do |o|
+ segment.send(rt)[dir][o].each do |l|
+ lc = l.clone
+ lc.set(dir, clone_name)
+ self << lc
+ end
+ end
+ end
+ end
+ end
+
+end
diff --git a/lib/rgfa/numeric_array.rb b/lib/rgfa/numeric_array.rb
new file mode 100644
index 0000000..a8cdaa4
--- /dev/null
+++ b/lib/rgfa/numeric_array.rb
@@ -0,0 +1,196 @@
+require_relative "error"
+
+#
+# A numeric array representable using the data type B of the GFA specification
+#
+class RGFA::NumericArray < Array
+
+ # Subtypes for signed integers, from the smallest to the largest
+ SIGNED_INT_SUBTYPE = %W[c s i]
+
+ # Subtypes for unsigned integers, from the smallest to the largest
+ UNSIGNED_INT_SUBTYPE = SIGNED_INT_SUBTYPE.map{|st|st.upcase}
+
+ # Subtypes for integers
+ INT_SUBTYPE = UNSIGNED_INT_SUBTYPE + SIGNED_INT_SUBTYPE
+
+ # Subtypes for floats
+ FLOAT_SUBTYPE = ["f"]
+
+ # Subtypes
+ SUBTYPE = INT_SUBTYPE + FLOAT_SUBTYPE
+
+ # Number of bits of unsigned integer subtypes
+ SUBTYPE_BITS = {"c" => 8, "s" => 16, "i" => 32}
+
+ # Range for integer subtypes
+ SUBTYPE_RANGE = Hash[
+ INT_SUBTYPE.map do |subtype|
+ [
+ subtype,
+ if subtype == subtype.upcase
+ 0..((2**SUBTYPE_BITS[subtype.downcase])-1)
+ else
+ (-(2**(SUBTYPE_BITS[subtype]-1)))..((2**(SUBTYPE_BITS[subtype]-1))-1)
+ end
+ ]
+ end
+ ]
+
+ # Validate the numeric array
+ #
+ # @raise [RGFA::NumericArray::ValueError] if the array is not valid
+ def validate!
+ # compute_subtype raises a ValueError if the content is not valid
+ compute_subtype
+ end
+
+ # Computes the subtype of the array from its content.
+ #
+ # If all elements are float, then the computed subtype is "f".
+ # If all elements are integer, the smallest possible numeric subtype
+ # is computed; thereby,
+ # if all elements are non-negative, an unsigned subtype is selected,
+ # otherwise a signed subtype.
+ # In all other cases an exception is raised.
+ #
+ # @raise [RGFA::NumericArray::ValueError] if the array is not a valid numeric
+ # array
+ # @return [RGFA::NumericArray::SUBTYPE]
+ def compute_subtype
+ if all? {|f|f.kind_of?(Float)}
+ return "f"
+ else
+ e_max = nil
+ e_min = nil
+ each do |e|
+ if !e.kind_of?(Integer)
+ raise RGFA::NumericArray::ValueError,
+ "NumericArray does not contain homogenous numeric values\n"+
+ "Content: #{inspect}"
+ end
+ e_max = e if e_max.nil? or e > e_max
+ e_min = e if e_min.nil? or e < e_min
+ end
+ return RGFA::NumericArray.integer_type(e_min..e_max)
+ end
+ end
+
+ # Computes the subtype for integers in a given range.
+ #
+ # If all elements are non-negative, an unsigned subtype is selected,
+ # otherwise a signed subtype.
+ #
+ # @param range [Range] the integer range
+ #
+ # @raise [RGFA::NumericArray::ValueError] if the integer range is outside
+ # all subtype ranges
+ #
+ # @return [RGFA::NumericArray::INT_SUBTYPE] subtype code
+ def self.integer_type(range)
+ if range.min < 0
+ SIGNED_INT_SUBTYPE.each do |st|
+ st_range = RGFA::NumericArray::SUBTYPE_RANGE[st]
+ if st_range.include?(range.min) and st_range.include?(range.max)
+ return st
+ end
+ end
+ else
+ UNSIGNED_INT_SUBTYPE.each do |st|
+ return st if range.max < RGFA::NumericArray::SUBTYPE_RANGE[st].max
+ end
+ end
+ raise RGFA::NumericArray::ValueError,
+ "NumericArray: values are outside of all integer subtype ranges\n"+
+ "Content: #{inspect}"
+ end
+
+ # Return self
+ # @param validate [Boolean] <i>(default: +false+)</i>
+ # if +true+, validate the range of the numeric values, according
+ # to the array subtype
+ # @raise [RGFA::NumericArray::ValueError] if validate is set and
+ # any value is not compatible with the subtype
+ # @return [RGFA::NumericArray]
+ def to_numeric_array(validate: false)
+ # no conversion is necessary: optionally validate, then return self
+ validate! if validate
+ self
+ end
+
+ # GFA datatype B representation of the numeric array
+ # @raise [RGFA::NumericArray::ValueError] if the array
+ # is not a valid numeric array
+ # @return [String]
+ def to_s
+ subtype = compute_subtype
+ "#{subtype},#{join(",")}"
+ end
+
+end
+
+# Exception raised if a value in a numeric array is not compatible
+# with the selected subtype
+class RGFA::NumericArray::ValueError < RGFA::Error; end
+
+# Exception raised if an invalid subtype code is found
+class RGFA::NumericArray::TypeError < RGFA::Error; end
+
+#
+# Method to create a numeric array from an array
+#
+class Array
+ # Create a numeric array from an Array instance
+ # @param validate [Boolean] <i>(default: +true+)</i>
+ # if +true+, validate the range of the numeric values, according
+ # to the array subtype
+ # @raise [RGFA::NumericArray::ValueError] if validate is set and
+ # any value is not compatible with the subtype
+ # @return [RGFA::NumericArray] the numeric array
+ def to_numeric_array(validate: true)
+ na = RGFA::NumericArray.new(self)
+ na.validate! if validate
+ na
+ end
+end
+
+#
+# Method to create a numeric array from a string
+#
+class String
+ # Create a numeric array from a string
+ # @param validate [Boolean] <i>(default: +true+)</i>
+ # if +true+, validate the range of the numeric values, according
+ # to the array subtype
+ # @raise [RGFA::NumericArray::ValueError] if validate is set and
+ # any value is not compatible with the subtype
+ # @raise [RGFA::NumericArray::TypeError] if the subtype code is invalid
+ # @return [RGFA::NumericArray] the numeric array
+ def to_numeric_array(validate: true)
+ elems = split(",")
+ subtype = elems.shift
+ integer = (subtype != "f")
+ if integer
+ range = RGFA::NumericArray::SUBTYPE_RANGE[subtype]
+ elsif !RGFA::NumericArray::SUBTYPE.include?(subtype)
+ raise RGFA::NumericArray::TypeError, "Subtype #{subtype} unknown"
+ end
+ elems.map do |e|
+ begin
+ if integer
+ e = Integer(e)
+ if validate and not range.include?(e)
+ raise "NumericArray: "+
+ "value is outside of subtype #{subtype} range\n"+
+ "Value: #{e}\n"+
+ "Range: #{range.inspect}\n"+
+ "Content: #{inspect}"
+ end
+ e
+ else
+ Float(e)
+ end
+ rescue => msg
+ raise RGFA::NumericArray::ValueError, msg
+ end
+ end
+ end
+end
diff --git a/lib/rgfa/paths.rb b/lib/rgfa/paths.rb
new file mode 100644
index 0000000..95b488b
--- /dev/null
+++ b/lib/rgfa/paths.rb
@@ -0,0 +1,98 @@
+require_relative "error"
+
+#
+# Methods for the RGFA class, which allow to handle paths in the graph.
+#
+module RGFA::Paths
+
+ # Adds a path line to the graph.
+ #
+ # The name of the path must not be already in use by a segment or by
+ # another path. For each link required by the path, an existing link is
+ # searched; if none is found, a virtual link is added to the graph (unless
+ # @segments_first_order is set, in which case an exception is raised).
+ # The path is then registered in its links and in its segments.
+ #
+ # @param gfa_line [String, RGFA::Line::Path] the path to add
+ # @raise [RGFA::DuplicatedLabelError] if a segment or path exists, which
+ # has the same name as the path
+ # @raise [RGFA::LineMissingError] if @segments_first_order is set and
+ # a required link does not exist
+ def add_path(gfa_line)
+ gfa_line = gfa_line.to_rgfa_line(validate: @validate)
+ # segments and paths share a single namespace
+ if @segments.has_key?(gfa_line.path_name)
+ raise RGFA::DuplicatedLabelError,
+ "Error when adding line: #{gfa_line}\n"+
+ "a segment already exists with the name: #{gfa_line.path_name}\n"+
+ "Segment: #{@segments[gfa_line.path_name]}"
+ elsif @paths.has_key?(gfa_line.path_name)
+ raise RGFA::DuplicatedLabelError,
+ "Error when adding line: #{gfa_line}\n"+
+ "a path already exists with the name: #{gfa_line.path_name}\n"+
+ "Path: #{@paths[gfa_line.path_name]}"
+ else
+ # NOTE(review): the path is registered before its links are checked; if
+ # LineMissingError is raised below, the path remains in @paths
+ @paths[gfa_line.path_name] = gfa_line
+ gfa_line.required_links.each do |from,to,cigar|
+ l = nil
+ # the link is searched only if both segments exist
+ if segment(from.segment) and segment(to.segment)
+ l = link_from_to(from, to, cigar)
+ end
+ if l.nil?
+ # no such link yet: create a virtual link as placeholder
+ v = RGFA::Line::Link.new({:from => from.segment,
+ :from_orient => from.orient,
+ :to => to.segment,
+ :to_orient => to.orient,
+ :overlap => cigar},
+ virtual: true)
+ if @segments_first_order
+ raise RGFA::LineMissingError, "Path: #{gfa_line}\n"+
+ "requires a non-existing link:\n"+
+ "#{v}"
+ end
+ add_link(v)
+ l = v
+ end
+ # the result of compatible_direct? is stored together with the link
+ # and with the path, in both cross-references
+ direct = l.compatible_direct?(from, to, cigar)
+ gfa_line.links << [l, direct]
+ l.paths << [gfa_line, direct]
+ end
+ gfa_line.segment_names.each do |sn_with_o|
+ # replace the segment name by the result of the segment lookup and
+ # register the path in the segment, by orientation
+ sn_with_o[0] = segment(sn_with_o[0])
+ sn_with_o[0].paths[sn_with_o[1]] << gfa_line
+ end
+ end
+ end
+ protected :add_path
+
+ # Delete a path from the RGFA graph
+ # @return [RGFA] self
+ # @param pt [String, RGFA::Line::Path] path name or instance
+ def delete_path(pt)
+ pt = path!(pt)
+ pt.segment_names.each {|sn, o| segment!(sn).paths[o].delete(pt)}
+ pt.links.each {|l, dir| l.paths.delete([pt, dir])}
+ @paths.delete(pt.path_name)
+ return self
+ end
+
+ # All path lines of the graph
+ # @return [Array<RGFA::Line::Path>]
+ def paths
+ # @paths is a hash (path name => path line); only the lines are returned
+ @paths.values
+ end
+
+ # @!macro [new] path
+ # Searches the path with name equal to +pt+.
+ # @param pt [String, RGFA::Line::Path] a path or path name
+ # @return [RGFA::Line::Path] if a path is found
+ # @return [nil] if no such path exists in the RGFA instance
+ #
+ def path(pt)
+ return pt if pt.kind_of?(RGFA::Line)
+ @paths[pt.to_sym]
+ end
+
+ # @!macro path
+ # @raise [RGFA::LineMissingError] if no such path exists in the RGFA instance
+ def path!(pt)
+ pt = path(pt)
+ raise RGFA::LineMissingError, "No path has name #{pt}" if pt.nil?
+ pt
+ end
+
+ # @return [Array<RGFA::Line::Path>] paths whose +segment_names+ include the
+ # specified segment.
+ # @!macro [new] segment_or_name
+ # @param s [RGFA::Line::Segment, Symbol] a segment instance or name
+ def paths_with(s)
+ # segment! raises if no such segment exists
+ segment!(s).all_paths
+ end
+
+end
diff --git a/lib/rgfa/rgl.rb b/lib/rgfa/rgl.rb
new file mode 100644
index 0000000..936f7a4
--- /dev/null
+++ b/lib/rgfa/rgl.rb
@@ -0,0 +1,194 @@
+begin
+ require "rgl/adjacency"
+ require "rgl/implicit"
+ require_relative "error"
+
+ #
+ # Conversion to RGL graphs
+ #
+ module RGFA::RGL
+
+ # Creates an RGL graph.
+ #
+ # @param oriented [Boolean] (defaults to: <i>+true+</i>) may the graph
+ # contain links of segments in different orientation?
+ # @return [RGL::ImplicitGraph] an rgl implicit directed graph
+ def to_rgl(oriented: true)
+ if oriented
+ to_rgl_oriented
+ else
+ to_rgl_unoriented
+ end
+ end
+
+ # Creates an RGL graph, including links orientations.
+ #
+ # @return [RGL::ImplicitGraph] an rgl implicit directed graph;
+ # where vertices are [RGFA::Segment, orientation] pairs
+ # (instances of the RGFA::OrientedSegment subclass of Array)
+ def to_rgl_oriented
+ RGL::ImplicitGraph.new do |g|
+ g.vertex_iterator do |block|
+ self.each_segment do |segment|
+ [:+, :-].each do |orient|
+ block.call([segment, orient].to_oriented_segment)
+ end
+ end
+ end
+ g.adjacent_iterator do |oriented_segment, block|
+ s = segment(oriented_segment.segment)
+ o = oriented_segment.orient
+ s.links[:from][o].each do |l|
+ os = [segment(l.to), l.to_orient].to_oriented_segment
+ block.call(os)
+ end
+ o = oriented_segment.invert_orient
+ s.links[:to][o].each do |l|
+ os = [segment(l.from), l.from_orient].to_oriented_segment
+ block.call(os.invert_orient)
+ end
+ end
+ g.directed = true
+ end
+ end
+
+ # Creates an RGL graph, assuming that all links orientations
+ # are "+".
+ #
+ # @raise [RGFA::RGL::ValueError] if the graph contains any link where
+ # from_orient or to_orient is :-
+ # @return [RGL::ImplicitGraph] an rgl implicit directed graph;
+ # where vertices are RGFA::Segment objects
+ def to_rgl_unoriented
+ RGL::ImplicitGraph.new do |g|
+ g.vertex_iterator {|block| self.each_segment {|s| block.call(s)}}
+ g.adjacent_iterator do |s, bl|
+ s = segment(s)
+ s.links[:from][:+].each do |l|
+ if l.to_orient == :-
+ raise RGFA::RGL::ValueError,
+ "Graph contains links with segments in reverse orientations"
+ end
+ bl.call(segment(l.to))
+ end
+ if s.links[:from][:-].size > 0
+ raise RGFA::RGL::ValueError,
+ "Graph contains links with segments in reverse orientations"
+ end
+ end
+ g.directed = true
+ end
+ end
+
+ def self.included(base)
+ base.extend(ClassMethods)
+ end
+
+ module ClassMethods
+
+ # @param g [RGL::ImplicitGraph, RGL::DirectedAdjacencyGraph] an RGL graph.
+ #
+ # @!macro[new] from_rgl
+ # <b>Accepted vertex formats</b>:
+ #
+ # - RGFA::OrientedSegment, or Array which can be converted to it;
+ # where the first element is a <i>segment specifier</i> (see below)
+ # - <i>segment specifier</i> alone: the orientation is assumed to be :+
+ #
+ # The <i>segment specifier</i> can be:
+ # - RGFA::Segment instance
+ # - String, segment representation (e.g. "S\tsegment\t*")
+ # - String, valid segment name (e.g. "segment")
+ #
+ # @raise [RGFA::RGL::InvalidFormatError] if the graph cannot be
+ # converted
+ #
+ # @return [RGFA] a new RGFA instance
+ def from_rgl(g)
+ gfa = RGFA.new
+ if not (g.respond_to?(:each_vertex) and
+ g.respond_to?(:each_edge))
+ raise RGFA::RGL::InvalidFormatError,
+ "#{g} is not a valid RGL graph"
+ end
+ if not g.directed?
+ raise RGFA::RGL::InvalidFormatError,
+ "#{g} is not a directed graph"
+ end
+ g.each_vertex {|v| add_segment_if_new(gfa, v)}
+ g.each_edge do |s, t|
+ gfa << RGFA::Line::Link.new(segment_name_and_orient(s) +
+ segment_name_and_orient(t) +
+ ["*"])
+ end
+ gfa
+ end
+
+ private
+
+ def add_segment_if_new(gfa, v)
+ # RGFA::OrientedSegment or GFA::GraphVertex
+ v = v.segment if v.respond_to?(:segment)
+ if v.kind_of?(Symbol)
+ # segment name as symbol
+ return if gfa.segment_names.include?(v)
+ v = RGFA::Line::Segment.new([v.to_s, "*"])
+ elsif v.kind_of?(String)
+ a = v.split("\t")
+ if a[0] == "S"
+ # string representation of segment
+ return if gfa.segment_names.include?(a[1].to_sym)
+ v = RGFA::Line::Segment.new(a[1..-1])
+ else
+ # segment name as string
+ return if gfa.segment_names.include?(v.to_sym)
+ v = RGFA::Line::Segment.new([v, "*"])
+ end
+ end
+ return if gfa.segment_names.include?(v.name)
+ gfa << v
+ end
+
+ def segment_name_and_orient(s)
+ # default orientation
+ o = s.respond_to?(:orient) ? s.orient.to_s : "+"
+ # RGFA::Line::Segment (also embedded in RGFA::OrientedSegment)
+ if s.respond_to?(:name)
+ s = s.name.to_s
+ elsif s.respond_to?(:segment)
+ # GFA::GraphVertex
+ s = s.segment.to_s
+ elsif s.respond_to?(:split)
+ a = s.split("\t")
+ s = a[1] if a[0] == "S"
+ else
+ s = s.to_s
+ end
+ return s, o
+ end
+
+ end
+
+ end
+
+ module RGL::Graph
+
+ # @!macro from_rgl
+ def to_rgfa
+ RGFA.from_rgl(self)
+ end
+
+ end
+
+ # Exception raised if conversion is impossible due to unexpected values
+ class RGFA::RGL::ValueError < RGFA::Error; end
+
+ # Exception raised if conversion is impossible due to general format problems
+ class RGFA::RGL::InvalidFormatError < RGFA::Error; end
+
+rescue LoadError
+
+ module RGFA::RGL
+ end
+
+end
diff --git a/lib/rgfa/segment_ends_path.rb b/lib/rgfa/segment_ends_path.rb
new file mode 100644
index 0000000..d06dd2b
--- /dev/null
+++ b/lib/rgfa/segment_ends_path.rb
@@ -0,0 +1,7 @@
+# An array containing {RGFA::SegmentEnd} elements, which defines a path
+# in the graph
+class RGFA::SegmentEndsPath < Array
+ def reverse
+ super.map {|segment_end| segment_end.to_segment_end.invert_end_type}
+ end
+end
diff --git a/lib/rgfa/segment_info.rb b/lib/rgfa/segment_info.rb
new file mode 100644
index 0000000..757b672
--- /dev/null
+++ b/lib/rgfa/segment_info.rb
@@ -0,0 +1,162 @@
+require_relative "error"
+
+# A segment or segment name plus an additional boolean attribute
+#
+# This class shall not be initialized directly.
+# @api private
+#
+class RGFA::SegmentInfo < Array
+
+ # Check that the elements of the array are compatible with the definition.
+ #
+ # @!macro [new] segment_info_validation_errors
+ # @raise [RGFA::SegmentInfo::InvalidSizeError] if size is not 2
+ # @raise [RGFA::SegmentInfo::InvalidAttributeError] if second element
+ # is not a valid info
+ # @return [void]
+ def validate!
+ if size != 2
+ raise RGFA::SegmentInfo::InvalidSizeError,
+ "Wrong n of elements, 2 expected (#{inspect})"
+ end
+ if !self.class::ATTR.include?(self[1])
+ raise RGFA::SegmentInfo::InvalidAttributeError,
+ "Invalid attribute (#{self[1].inspect})"
+ end
+ return nil
+ end
+
+ # @return [Symbol, RGFA::Line::Segment] the segment instance or name
+ def segment
+ self[0]
+ end
+
+ # Set the segment
+ # @param value [Symbol, RGFA::Line::Segment] the segment instance or name
+ # @return [Symbol, RGFA::Line::Segment] +value+
+ def segment=(value)
+ self[0]=value
+ end
+
+ # @return [Symbol] the segment name
+ def name
+ self[0].kind_of?(RGFA::Line::Segment) ? self[0].name : self[0].to_sym
+ end
+
+ # @return [Symbol] the attribute
+ def attribute
+ self[1]
+ end
+
+ # Set the attribute
+ # @param value [Symbol] the attribute
+ # @return [Symbol] +value+
+ def attribute=(value)
+ self[1]=(value)
+ end
+
+ # @return [Symbol] the other possible value of the attribute
+ def attribute_inverted
+ self.class::ATTR[self.class::ATTR[0] == self[1] ? 1 : 0]
+ end
+
+ # @return [RGFA::SegmentInfo] same segment, inverted attribute
+ def invert_attribute
+ self.class.new([self[0], self.attribute_inverted])
+ end
+
+ # @param [Symbol] attribute an attribute value
+ # @return [Symbol] the other attribute value
+ def self.invert(attribute)
+ i = self::ATTR.index(attribute.to_sym)
+ if i.nil?
+ raise RGFA::SegmentInfo::InvalidAttributeError,
+ "Invalid attribute (#{self[1].inspect})"
+ end
+ return self::ATTR[i-1]
+ end
+
+ # @return [String] name of the segment and attribute
+ def to_s
+ "#{name}#{attribute}"
+ end
+
+ # @return [Symbol] name of the segment and attribute
+ def to_sym
+ to_s.to_sym
+ end
+
+ # Compare the segment names and attributes of two instances
+ #
+ # @param [RGFA::SegmentInfo] other the other instance
+ # @return [Boolean]
+ def ==(other)
+ to_s == other.to_segment_info(self.class).to_s
+ end
+
+ # Compare the segment names and attributes of two instances
+ #
+ # @param [RGFA::SegmentInfo] other the other instance
+ # @return [Boolean]
+ def <=>(other)
+ to_s <=> other.to_segment_info(self.class).to_s
+ end
+
+end
+
+# Error raised if the size of the array is wrong
+class RGFA::SegmentInfo::InvalidSizeError < RGFA::Error; end
+
+# Error raised if an unknown value for attribute is used
+class RGFA::SegmentInfo::InvalidAttributeError < RGFA::Error; end
+
+# A representation of a segment end
+class RGFA::SegmentEnd < RGFA::SegmentInfo
+ # Segment end type (begin or end)
+ ATTR = [ END_TYPE_BEGIN = :B, END_TYPE_END = :E ]
+ alias_method :end_type, :attribute
+ alias_method :end_type=, :attribute=
+ alias_method :invert_end_type, :invert_attribute
+ alias_method :end_type_inverted, :attribute_inverted
+end
+
+# A segment plus orientation
+class RGFA::OrientedSegment < RGFA::SegmentInfo
+ # Segment orientation
+ ATTR = [ ORIENT_FWD = :+, ORIENT_REV = :- ]
+ alias_method :orient, :attribute
+ alias_method :orient=, :attribute=
+ alias_method :invert_orient, :invert_attribute
+ alias_method :orient_inverted, :attribute_inverted
+end
+
+class Array
+
+ # Create and validate a segment end from an array
+ # @!macro segment_info_validation_errors
+ # @return [RGFA::SegmentEnd]
+ def to_segment_end
+ to_segment_info(RGFA::SegmentEnd)
+ end
+
+ # Create and validate an oriented segment from an array
+ # @!macro segment_info_validation_errors
+ # @return [RGFA::OrientedSegment]
+ def to_oriented_segment
+ to_segment_info(RGFA::OrientedSegment)
+ end
+
+ protected
+
+ def to_segment_info(subclass)
+ return self if self.kind_of?(subclass)
+ # support converting from gfa gem GraphVertex objects:
+ if respond_to?(:segment) and respond_to?(:orient)
+ return RGFA::OrientedSegment.new([segment.to_sym, orient.to_sym])
+ end
+ se = subclass.new(map {|e| e.kind_of?(String) ? e.to_sym : e})
+ se.validate!
+ return se
+ end
+
+end
diff --git a/lib/rgfa/segments.rb b/lib/rgfa/segments.rb
new file mode 100644
index 0000000..1743561
--- /dev/null
+++ b/lib/rgfa/segments.rb
@@ -0,0 +1,99 @@
+require_relative "error"
+
+#
+# Methods for the RGFA class, which allow to handle segments in the graph.
+#
+module RGFA::Segments
+
+ def add_segment(gfa_line)
+ gfa_line = gfa_line.to_rgfa_line(validate: @validate)
+ segment_name = gfa_line.name
+ if @paths.has_key?(segment_name)
+ raise RGFA::DuplicatedLabelError,
+ "Error when adding line: #{gfa_line}\n"+
+ "a path already exists with the name: #{segment_name}\n"+
+ "Path: #{@paths[segment_name]}"
+ elsif @segments.has_key?(segment_name)
+ if @segments[segment_name].virtual?
+ @segments[segment_name].real!(gfa_line)
+ else
+ raise RGFA::DuplicatedLabelError,
+ "Error when adding line: #{gfa_line}\n"+
+ "a segment already exists with the name: #{segment_name}\n"+
+ "Segment: #{@segments[segment_name]}"
+ end
+ else
+ @segments[segment_name] = gfa_line
+ end
+ end
+ protected :add_segment
+
+ # Delete a segment from the RGFA graph
+ # @return [RGFA] self
+ # @param s [String, RGFA::Line::Segment] segment name or instance
+ def delete_segment(s, cascade=true)
+ s = segment!(s)
+ if cascade
+ connected_segments(s).each {|cs| unconnect_segments(s, cs)}
+ [:+, :-].each do |o|
+ s.paths[o].each {|pt| delete_path(pt)}
+ end
+ end
+ @segments.delete(s.name)
+ return self
+ end
+
+ # All segment lines of the graph
+ # @return [Array<RGFA::Line::Segment>]
+ def segments
+ @segments.values
+ end
+
+ # @!macro [new] segment
+ # Searches the segment with name equal to +s+ (a line instance is returned).
+ # @param s [String, RGFA::Line::Segment] a segment or segment name
+ # @return [RGFA::Line::Segment] if a segment is found
+ # @return [nil] if no such segment exists in the RGFA instance
+ #
+ def segment(s)
+ return s if s.kind_of?(RGFA::Line)
+ @segments[s.to_sym]
+ end
+
+ # @!macro segment
+ # @raise [RGFA::LineMissingError] if no such segment exists
+ def segment!(s)
+ seg = segment(s)
+ if seg.nil?
+ raise RGFA::LineMissingError, "No segment has name #{s}"+
+ "#{segment_names.size < 10 ?
+ "\nSegment names: "+segment_names.inspect : ''}"
+ end
+ seg
+ end
+
+ # @return [Array<String>] list of names of segments connected to +segment+
+ # by links or containments
+ def connected_segments(segment)
+ (neighbours([segment, :B]).map{|s, e| s} +
+ neighbours([segment, :E]).map{|s, e| s} +
+ contained_in(segment).map{|c| c.to} +
+ containing(segment).map{|c| c.from}).uniq
+ end
+
+ # Delete all links/containments involving two segments
+ # @return [RGFA] self
+ # @param segment1 [String, RGFA::Line::Segment] segment 1 name or instance
+ # @param segment2 [String, RGFA::Line::Segment] segment 2 name or instance
+ def unconnect_segments(segment1, segment2)
+ containments_between(segment1, segment2).each {|c| delete_containment(c)}
+ containments_between(segment2, segment1).each {|c| delete_containment(c)}
+ [[:B, :E], [:B, :B], [:E, :B], [:E, :E]].each do |end1, end2|
+ links_between([segment1, end1], [segment2, end2]).each do |l|
+ delete_link(l)
+ end
+ end
+ return self
+ end
+
+end
diff --git a/lib/rgfa/sequence.rb b/lib/rgfa/sequence.rb
new file mode 100644
index 0000000..b5bf830
--- /dev/null
+++ b/lib/rgfa/sequence.rb
@@ -0,0 +1,65 @@
+#
+# Extensions of the String class to handle nucleotidic sequences
+#
+module RGFA::Sequence
+
+ # Computes the reverse complement of a nucleotidic sequence
+ #
+ # @return [String] reverse complement, without newlines and spaces
+ # @return [String] "*" if string is "*"
+ #
+ # @param tolerant [Boolean] <i>(defaults to: +false+)</i>
+ # if true, anything non-sequence is complemented to itself
+ # @param rnasequence [Boolean] <i>(defaults to: +false+)</i>
+ # if true, any A and a is complemented into U and u; otherwise
+ # this is done only if a U or u is found; otherwise DNA is assumed
+ #
+ # @raise [RuntimeError] if not +tolerant+ and chars are found for which
+ # no Watson-Crick complement is defined
+ # @raise [RuntimeError] if sequence contains both U and T
+ #
+ # @example
+ # "ACTG".rc # => "CAGT"
+ # "acGT".rc # => "ACgt"
+ # @example Undefined sequence is represented by "*":
+ # "*".rc # => "*"
+ # @example Extended IUPAC Alphabet:
+ # "ARBN".rc # => "NVYT"
+ # @example Usage with RNA sequences:
+ # "ACUG".rc # => "CAGU"
+ # "ACG".rc(rnasequence: true) # => "CGU"
+ # "ACUT".rc # (raises RuntimeError, both U and T)
+ def rc(tolerant: false, rnasequence: false)
+ return "*" if self == "*"
+ retval = each_char.map do |c|
+ if c == "U" or c == "u"
+ rnasequence = true
+ elsif rnasequence and (c == "T" or c == "t")
+ raise "String contains both U/u and T/t"
+ end
+ wcc = WCC.fetch(c, tolerant ? c : nil)
+ raise "#{self}: no Watson-Crick complement for #{c}" if wcc.nil?
+ wcc
+ end.reverse.join
+ if rnasequence
+ retval.tr!("tT","uU")
+ end
+ retval
+ end
+
+ # Watson-Crick Complements
+ WCC = {"a"=>"t","t"=>"a","A"=>"T","T"=>"A",
+ "c"=>"g","g"=>"c","C"=>"G","G"=>"C",
+ "b"=>"v","B"=>"V","v"=>"b","V"=>"B",
+ "h"=>"d","H"=>"D","d"=>"h","D"=>"H",
+ "R"=>"Y","Y"=>"R","r"=>"y","y"=>"r",
+ "K"=>"M","M"=>"K","k"=>"m","m"=>"k",
+ "S"=>"S","s"=>"s","w"=>"w","W"=>"W",
+ "n"=>"n","N"=>"N","u"=>"a","U"=>"A",
+ "-"=>"-","."=>".","="=>"=",
+ " "=>"","\n"=>""}
+end
+
+class String
+ include RGFA::Sequence
+end
diff --git a/lib/rgfatools.rb b/lib/rgfatools.rb
new file mode 100644
index 0000000..332a72a
--- /dev/null
+++ b/lib/rgfatools.rb
@@ -0,0 +1,102 @@
+require "rgfa"
+
+#
+# Module defining additional methods for the RGFA class.
+#
+# RGFATools is an extension to the RGFA library, which allows performing further
+# operations. Thereby additional conventions are required, with respect to the
+# GFA specification, which are compatible with it.
+#
+# The methods defined here allow, e.g., to randomly orient a segment which has
+# the same connections on both sides, to compute copy numbers and multiply or
+# delete segments according to them, to distribute the links of copies after
+# multiplying a segment, or to eliminate edges in the graph which are
+# incompatible with a Hamiltonian path.
+#
+# Custom optional fields are defined, such as "cn" for the copy number of a
+# segment, "or" for the original segment(s) of a duplicated or merged segment,
+# "mp" for the starting position of original segments in a merged segment, "rp"
+# for the position of possible inversions due to arbitrary orientation of some
+# segments by the program.
+#
+# Furthermore a convention for the naming of the segments is introduced, which
+# gives a special meaning to the characters "_^()".
+#
+# @developer
+# In the main file is only the method redefinition infrastructure
+# (private methods). The public methods are in the included modules.
+#
+module RGFATools
+end
+
+require_relative "rgfatools/artifacts"
+require_relative "rgfatools/copy_number"
+require_relative "rgfatools/invertible_segments"
+require_relative "rgfatools/multiplication"
+require_relative "rgfatools/superfluous_links"
+require_relative "rgfatools/linear_paths"
+require_relative "rgfatools/p_bubbles"
+
+module RGFATools
+
+ include RGFATools::Artifacts
+ include RGFATools::CopyNumber
+ include RGFATools::InvertibleSegments
+ include RGFATools::Multiplication
+ include RGFATools::SuperfluousLinks
+ include RGFATools::LinearPaths
+ include RGFATools::PBubbles
+
+ private
+
+ def self.included(klass)
+ included_modules.each do |included_module|
+ if included_module.const_defined?("Redefined")
+ self.redefine_methods(included_module::Redefined, klass)
+ end
+ if included_module.const_defined?("ClassMethods")
+ klass.extend(included_module::ClassMethods)
+ end
+ end
+ end
+
+ def self.redefine_methods(redefined_methods, klass)
+ klass.class_eval do
+ redefined_methods.each do |redefined_method|
+ was_private = klass.private_instance_methods.include?(redefined_method)
+ public redefined_method
+ alias_method :"#{redefined_method}_without_rgfatools", redefined_method
+ alias_method redefined_method, :"#{redefined_method}_with_rgfatools"
+ if was_private
+ private redefined_method,
+ :"#{redefined_method}_without_rgfatools",
+ :"#{redefined_method}_with_rgfatools"
+ end
+ end
+ end
+ end
+
+ ProgramName = "RGFATools"
+
+ def add_program_name_to_header
+ set_header_field(:pn, RGFATools::ProgramName)
+ end
+
+end
+
+class RGFA
+ include RGFATools
+
+ # Enable {RGFATools} extensions of RGFA methods
+ # @return [void]
+ def enable_extensions
+ @extensions_enabled = true
+ end
+
+ # Disable {RGFATools} extensions of RGFA methods
+ # @return [void]
+ def disable_extensions
+ @extensions_enabled = false
+ end
+
+end
diff --git a/lib/rgfatools/artifacts.rb b/lib/rgfatools/artifacts.rb
new file mode 100644
index 0000000..073d0d9
--- /dev/null
+++ b/lib/rgfatools/artifacts.rb
@@ -0,0 +1,29 @@
+#
+# Methods which edit the graph components without traversal
+#
+module RGFATools::Artifacts
+
+ # Remove connected components whose sum of lengths of the segments
+ # is under a specified value.
+ # @param minlen [Integer] the minimum length
+ # @return [RGFA] self
+ def remove_small_components(minlen)
+ rm(connected_components.select {|cc|
+ cc.map{|sn|segment(sn).length}.reduce(:+) < minlen })
+ self
+ end
+
+ # Remove end segments, whose sequence length is under a specified value.
+ # @param minlen [Integer] the minimum length
+ # @return [RGFA] self
+ def remove_dead_ends(minlen)
+ segments.each do |s|
+ c = connectivity(s)
+ rm(s) if s.length < minlen and
+ (c[0] == 0 or c[1] == 0) and
+ !cut_segment?(s)
+ end
+ self
+ end
+
+end
diff --git a/lib/rgfatools/copy_number.rb b/lib/rgfatools/copy_number.rb
new file mode 100644
index 0000000..a24801c
--- /dev/null
+++ b/lib/rgfatools/copy_number.rb
@@ -0,0 +1,126 @@
+#
+# Methods which edit the graph components without traversal
+#
+module RGFATools::CopyNumber
+
+ # Sets the count tag to use as default by coverage computations
+ # <i>(defaults to: +:RC+)</i>.
+ #
+ # @return [RGFA] self
+ # @param tag [Symbol] the tag to use
+ def set_default_count_tag(tag)
+ @default[:count_tag] = tag
+ return self
+ end
+
+ # Sets the unit length (k-mer size, average read length or average fragment
+ # length) to use for coverage computation
+ # <i>(defaults to: 1)</i>.
+ #
+ # @param unit_length [Integer] the unit length to use
+ # @return [RGFA] self
+ def set_count_unit_length(unit_length)
+ @default[:unit_length] = unit_length
+ return self
+ end
+
+ # Delete segments which have a coverage under a specified value.
+ #
+ # @param mincov [Integer] the minimum coverage
+ # @!macro [new] count_tag
+ # @param count_tag [Symbol] <i>(defaults to: +:RC+ or the value set by
+ # {#set_default_count_tag})</i> the count tag to use for coverage
+ # computation
+ # @!macro [new] unit_length
+ # @param unit_length [Integer] <i>(defaults to: 1 or the value set by
+ # {#set_count_unit_length})</i> the unit length to use for coverage
+ # computation
+ #
+ # @return [RGFA] self
+ def delete_low_coverage_segments(mincov,
+ count_tag: @default[:count_tag],
+ unit_length: @default[:unit_length])
+ segments.map do |s|
+ cov = s.coverage(count_tag: count_tag,
+ unit_length: unit_length)
+ cov < mincov ? s.name : nil
+ end.compact.each do |sn|
+ delete_segment(sn)
+ end
+ self
+ end
+
+ # @param mincov [Integer] <i>(defaults to: 1/4 of +single_copy_coverage+)</i>
+ # the minimum coverage, cn for segments under this value is set to 0
+ # @param single_copy_coverage [Integer]
+ # the coverage that shall be considered to be single copy
+ # @param cn_tag [Symbol] <i>(defaults to: +:cn+)</i>
+ # the tag to use for storing the copy number
+ # @!macro count_tag
+ # @!macro unit_length
+ # @return [RGFA] self
+ def compute_copy_numbers(single_copy_coverage,
+ mincov: single_copy_coverage * 0.25,
+ count_tag: @default[:count_tag],
+ cn_tag: :cn,
+ unit_length: @default[:unit_length])
+ segments.each do |s|
+ cov = s.coverage!(count_tag: count_tag, unit_length: unit_length).to_f
+ if cov < mincov
+ cn = 0
+ elsif cov < single_copy_coverage
+ cn = 1
+ else
+ cn = (cov / single_copy_coverage).round
+ end
+ s.set(cn_tag, cn)
+ end
+ self
+ end
+
+ # Applies the computed copy number to a segment
+ # @!macro [new] apply_copy_number
+ # @return [RGFA] self
+ # @param [:lowcase, :upcase, :number, :copy] copy_names_suffix
+ # <i>(Defaults to: +:lowcase+)</i>
+ # Symbol representing a system to compute the names from the name of
+ # the original segment. See "Automatic computation of the copy names"
+ # in #multiply_extended.
+ # @param count_tag [Symbol] tag to use for storing the copy number
+ # (default: cn)
+ # @param distribute [Symbol] distribution policy, see #multiply_extended
+ # @param origin_tag [Symbol] tag to use for storing the origin (default: or)
+ # @param conserve_components [Boolean] when factor is 0,
+ # do not remove segments if doing so increases the number of components
+ # in the graph (default: true)
+ # @param segment [RGFA::Line::Segment, Symbol] segment or segment name
+ def apply_copy_number(segment, count_tag: :cn,
+ distribute: :auto,
+ copy_names_suffix: :lowcase, origin_tag: :or,
+ conserve_components: true)
+ s, sn = segment_and_segment_name(segment)
+ factor = s.get!(count_tag)
+ multiply_extended(sn, factor,
+ distribute: distribute,
+ copy_names: copy_names_suffix,
+ conserve_components: conserve_components,
+ origin_tag: origin_tag)
+ self
+ end
+
+ # Applies the computed copy number to all segments
+ # @!macro apply_copy_number
+ def apply_copy_numbers(count_tag: :cn, distribute: :auto,
+ copy_names_suffix: :lowcase, origin_tag: :or,
+ conserve_components: true)
+ segments.sort_by{|s|s.get!(count_tag)}.each do |s|
+ multiply_extended(s.name, s.get(count_tag),
+ distribute: distribute,
+ copy_names: copy_names_suffix,
+ conserve_components: conserve_components,
+ origin_tag: origin_tag)
+ end
+ self
+ end
+
+end
diff --git a/lib/rgfatools/invertible_segments.rb b/lib/rgfatools/invertible_segments.rb
new file mode 100644
index 0000000..7f02cfe
--- /dev/null
+++ b/lib/rgfatools/invertible_segments.rb
@@ -0,0 +1,104 @@
+#
+# Methods which edit the graph components without traversal
+#
+module RGFATools::InvertibleSegments
+
+ # Selects a random orientation for all invertible segments
+ # @return [RGFA] self
+ def randomly_orient_invertibles
+ segment_names.each do |sn|
+ if segment_same_links_both_ends?(sn)
+ randomly_orient_proven_invertible_segment(sn)
+ end
+ end
+ self
+ end
+
+ # Selects a random orientation for an invertible segment
+ # @return [RGFA] self
+ # @!macro segment_param
+ def randomly_orient_invertible(segment)
+ segment_name = segment.kind_of?(RGFA::Line) ? segment.name : segment
+ if !segment_same_links_both_ends?(segment_name)
+ raise "Only segments with links to the same or equivalent segments "+
+ "at both ends can be randomly oriented"
+ end
+ randomly_orient_proven_invertible_segment(segment_name)
+ self
+ end
+
+ private
+
+ def randomly_orient_proven_invertible_segment(segment_name)
+ parts = partitioned_links_of([segment_name, :E])
+ if parts.size == 2
+ tokeep1_other_end = parts[0][0].other_end([segment_name, :E])
+ tokeep2_other_end = parts[1][0].other_end([segment_name, :E])
+ elsif parts.size == 1 and parts[0].size == 2
+ tokeep1_other_end = parts[0][0].other_end([segment_name, :E])
+ tokeep2_other_end = parts[0][1].other_end([segment_name, :E])
+ else
+ return
+ end
+ return if links_of(tokeep1_other_end).size < 2
+ return if links_of(tokeep2_other_end).size < 2
+ delete_other_links([segment_name, :E], tokeep1_other_end)
+ delete_other_links([segment_name, :B], tokeep2_other_end)
+ annotate_random_orientation(segment_name)
+ end
+
+ def link_targets_for_cmp(segment_end)
+ links_of(segment_end).map {|l| l.other_end(segment_end).join}
+ end
+
+ def segment_same_links_both_ends?(segment_name)
+ e_links = link_targets_for_cmp([segment_name, :E])
+ b_links = link_targets_for_cmp([segment_name, :B])
+ return e_links == b_links
+ end
+
+ def segment_signature(segment_end)
+ s = segment!(segment_end[0])
+ link_targets_for_cmp(segment_end).join(",")+"\t"+
+ link_targets_for_cmp(segment_end.invert_end_type).join(",")+"\t"+
+ [:or].map do |field|
+ s.send(field)
+ end.join("\t")
+ end
+
+ def partitioned_links_of(segment_end)
+ links_of(segment_end).group_by do |l|
+ other_end = l.other_end(segment_end)
+ sig = segment_signature(other_end)
+ sig
+ end.map {|sig, par| par}
+ end
+
+ def annotate_random_orientation(segment_name)
+ segment = segment!(segment_name)
+ n = segment.name.to_s.split("_")
+ pairs = 0
+ pos = [1, segment.LN]
+ if segment.or
+ o = segment.or.to_s.split(",")
+ if o.size > 2
+ while o.last == o.first + "^" or o.last + "^" == o.first
+ pairs += 1
+ o.pop
+ o.shift
+ end
+ end
+ if segment.mp
+ pos = [segment.mp[pairs*2], segment.mp[-1-pairs*2]]
+ end
+ end
+ rn = segment.rn
+ rn ||= []
+ rn += pos
+ segment.rn = rn
+ n[pairs] = "(" + n[pairs]
+ n[-1-pairs] = n[-1-pairs] + ")"
+ rename(segment.name, n.join("_"))
+ end
+
+end
diff --git a/lib/rgfatools/linear_paths.rb b/lib/rgfatools/linear_paths.rb
new file mode 100644
index 0000000..bb70f65
--- /dev/null
+++ b/lib/rgfatools/linear_paths.rb
@@ -0,0 +1,140 @@
+#
+# Methods for the RGFA class, which involve a traversal of the graph following
+# links
+#
+module RGFATools::LinearPaths
+
+ # @!method merge_linear_path(segpath, **options)
+ # Merge a linear path, i.e. a path of segments without extra-branches.
+ # @!macro [new] merge_more
+ # Extends the RGFA method, with additional functionality:
+ # - +name+: the name of the merged segment is set to the name of the
+ # single segments joined by underscore (+_+). If a name already
+ # contained an underscore, it is split before merging. Whenever a
+ # segment is reverse complemented, its name (or the name of all its
+ # components) is suffixed with a +^+; if the last letter was already
+ # +^+, it is removed; if it contained +_+ the name is split, the
+ # elements reversed and joined back using +_+; round parentheses are
+ # removed from the name before processing and added back after it.
+ # - +:or+: keeps track of the origin of the merged segment; the
+ # origin tag is set to an array of :or or name
+ # (if no :or available) tags of the segment which have been merged;
+ # the character +^+ is assigned the same meaning as in +name+
+ # - +:rn+: tag used to store possible inversion positions and
+ # it is updated by this method; i.e. it is passed from the single
+ # segments to the merged segment, and the coordinates updated
+ # - +:mp+: tag used to store the position of the
+ # single segments in the merged segment; it is created or updated by
+ # this method
+ # Note that the extensions to the original method will only be run
+ # if either #enable_extensions has been called on the RGFA object
+ # or the enable_tracking parameter is set.
+ # After calling #enable_extensions, you may still obtain the original
+ # behaviour by setting the disable_tracking parameter.
+ # @!macro merge_more
+ #
+ # @!macro [new] merge_lim
+ #
+ # Limitations: all containments and paths involving merged segments are
+ # deleted.
+ # @!macro merge_lim
+ #
+ # @param segpath [Array<RGFA::SegmentEnd>] a linear path, such as that
+ # retrieved by #linear_path (see RGFA API documentation)
+ # @!macro [new] merge_options
+ # @param options [Hash] optional keyword arguments
+ # @option options [String, :short, nil] :merged_name (nil)
+ # if nil, the merged_name is automatically computed; if :short,
+ # a name is computed starting with "merged1" and calling next until
+ # an available name is found; if String, the name to use
+ # @option options [Boolean] :cut_counts (false)
+ # if true, total count in merged segment m, composed of segments
+ # s of set S is multiplied by the factor Sum(|s in S|)/|m|
+ # @option options [Boolean] :enable_tracking (false)
+ # if true, the extended method with RGFATools is called,
+ # no matter if RGFA#enable_extensions was called.
+ # @option options [Boolean] :disable_tracking (false)
+ # if true, the original method of RGFA without RGFATools is called,
+ # no matter if RGFA#enable_extensions was called.
+ # @!macro merge_options
+ #
+ # @return [RGFA] self
+ # @see #merge_linear_paths
+
+ # @!method merge_linear_paths(**options)
+ # Merge all linear paths in the graph, i.e.
+ # paths of segments without extra-branches
+ # @!macro merge_more
+ # @!macro merge_lim
+ # @!macro merge_options
+ #
+ # @return [RGFA] self
+
+ private
+
+ Redefined = [:add_segment_to_merged]
+
+ def reverse_segment_name(name, separator)
+ name.to_s.split(separator).map do |part|
+ openp = part[0] == "("
+ part = part[1..-1] if openp
+ closep = part[-1] == ")"
+ part = part[0..-2] if closep
+ part = (part[-1] == "^") ? part[0..-2] : part+"^"
+ part += ")" if openp
+ part = "(#{part}" if closep
+ part
+ end.reverse.join(separator)
+ end
+
+ def reverse_pos_array(pos_array, lastpos)
+ return nil if pos_array.nil? or lastpos.nil?
+ pos_array.map {|pos| lastpos - pos + 1}.reverse
+ end
+
+ def add_segment_to_merged_with_rgfatools(merged, segment, reversed, cut, init,
+ options)
+ if !options[:enable_tracking] and
+ (options[:disable_tracking] or !@extensions_enabled)
+ return add_segment_to_merged_without_rgfatools(merged, segment, reversed,
+ cut, init, options)
+ end
+ s = (reversed ? segment.sequence.rc[cut..-1] : segment.sequence[cut..-1])
+ n = (reversed ? reverse_segment_name(segment.name, "_") : segment.name.to_s)
+ rn = (reversed ? reverse_pos_array(segment.rn, segment.LN) : segment.rn)
+ mp = (reversed ? reverse_pos_array(segment.mp, segment.LN) : segment.mp)
+ mp = [1, segment.LN] if mp.nil? and segment.LN
+ if segment.or.nil?
+ o = n
+ else
+ o = (reversed ? reverse_segment_name(segment.or, ",") : segment.or)
+ end
+ if init
+ merged.sequence = s
+ merged.name = options[:merged_name].nil? ? n : options[:merged_name]
+ merged.LN = segment.LN
+ merged.rn = rn
+ merged.or = o
+ merged.mp = mp
+ else
+ (segment.sequence == "*") ? (merged.sequence = "*")
+ : (merged.sequence += s)
+ merged.name = "#{merged.name}_#{n}" if options[:merged_name].nil?
+ if merged.LN
+ if rn
+ rn = rn.map {|pos| pos - cut + merged.LN}
+ merged.rn = merged.rn.nil? ? rn : merged.rn + rn
+ end
+ if mp and merged.mp
+ merged.mp += mp.map {|pos| pos - cut + merged.LN}
+ end
+ segment.LN ? merged.LN += (segment.LN - cut)
+ : merged.LN = nil
+ else
+ merged.mp = nil
+ end
+ merged.or = merged.or.nil? ? o : "#{merged.or},#{o}"
+ end
+ end
+
+end
diff --git a/lib/rgfatools/multiplication.rb b/lib/rgfatools/multiplication.rb
new file mode 100644
index 0000000..17e070a
--- /dev/null
+++ b/lib/rgfatools/multiplication.rb
@@ -0,0 +1,194 @@
+#
+# Methods which edit the graph components without traversal
+#
+module RGFATools::Multiplication
+
+ # Allowed values for the links_distribution_policy option
+ LINKS_DISTRIBUTION_POLICY = [:off, :auto, :equal, :E, :B]
+
+ # @overload multiply(segment, factor, copy_names: :lowcase, distribute: :auto, conserve_components: true, origin_tag: :or)
+ # Create multiple copies of a segment.
+ #
+ # Complements the multiply method of gfatools with additional functionality.
+ # These extensions are used only after #enable_extensions is called on the
+ # RGFA object. After that, you may still call the original method
+ # using #multiply_without_rgfatools.
+ #
+ # For more information on the additional functionality, see
+ # #multiply_extended.
+ #
+ # @return [RGFA] self
+ def multiply_with_rgfatools(segment, factor,
+ copy_names: :lowcase,
+ distribute: :auto,
+ conserve_components: true,
+ origin_tag: :or)
+ if !@extensions_enabled # extensions off: distribute and origin_tag are not passed on
+ return multiply_without_rgfatools(segment, factor,
+ copy_names: copy_names,
+ conserve_components: conserve_components)
+ else # extensions on: use the extended implementation with all options
+ multiply_extended(segment, factor,
+ copy_names: copy_names,
+ distribute: distribute,
+ conserve_components: conserve_components,
+ origin_tag: origin_tag)
+ end
+ end
+
+ # Create multiple copies of a segment.
+ #
+ # Complements the multiply method of gfatools with additional functionality.
+ # To always run the additional functionality when multiply is called,
+ # use RGFA#enable_extensions.
+ #
+ # @!macro [new] copynames_text
+ #
+ # <b>Automatic computation of the copy names:</b>
+ #
+ # - First, it is checked if the name of the original segment ends with a
+ # relevant
+ # string, i.e. a lower case letter (for +:lowcase+), an upper case letter
+ # (for +:upcase+), a digit (for +:number+), or the string +"_copy"+
+ # plus one or more optional digits (for +:copy+).
+ # - If so, it is assumed, it was already a copy, and it is not
+ # altered.
+ # - If not, then +a+ (for +:lowcase+), +A+ (for +:upcase+), +1+ (for
+ # +:number+), +_copy+ (for +:copy+) is appended to the string.
+ # - Then, in all
+ # cases, next (*) is called on the string, until a valid, non-existent
+ # name is found for each of the segment copies
+ # - (*) = except for +:copy+, where
+ # for the first copy no digit is present, but for the following is,
+ # i.e. the name suffixes will be +_copy+, +_copy2+, +_copy3+, etc.
+ # - Can be overridden, by providing an array of copy names.
+ #
+ # @!macro [new] ldp_text
+ #
+ # <b>Links distribution policy</b>
+ #
+ # Depending on the value of the option +distribute+, an end
+ # is eventually selected for distribution of the links.
+ #
+ # - +:off+: no distribution performed
+ # - +:E+: links of the E end are distributed
+ # - +:B+: links of the B end are distributed
+ # - +:equal+: select an end for which the number of links is equal to
+ # +factor+, if any; if both, then the E end is selected
+ # - +:auto+: automatically select E or B, trying to maximize the number of
+ # links which can be deleted
+ #
+ # @param [Integer] factor multiplication factor; if 0, delete the segment;
+ # if 1, do nothing; if > 1, number of copies to create
+ # @!macro [new] segment_param
+ # @param segment [String, RGFA::Line::Segment] segment name or instance
+ # @param [:lowcase, :upcase, :number, :copy, Array<String>] copy_names
+ # <i>(Defaults to: +:lowcase+)</i>
+ # Array of names for the copies of the segment,
+ # or a symbol, which defines a system to compute the names from the name of
+ # the original segment. See "Automatic computation of the copy names".
+ # @!macro [new] conserve_components
+ # @param [Boolean] conserve_components <i>(Defaults to: +true+)</i>
+ # If factor == 0 (i.e. deletion), delete segment only if
+ # #cut_segment?(segment) is +false+ (see RGFA API).
+ # @!macro [new] ldp_param
+ # @param distribute
+ # [RGFATools::Multiplication::LINKS_DISTRIBUTION_POLICY]
+ # <i>(Defaults to: +:auto+)</i>
+ # Determines if and for which end of the segment, links are distributed
+ # among the copies. See "Links distribution policy".
+ # @!macro [new] origin_tag
+ # @param origin_tag [Symbol] <i>(Defaults to: +:or+)</i>
+ # Name of the custom tag to use for storing origin information.
+ #
+ # @return [RGFA] self
+ def multiply_extended(segment, factor,
+ copy_names: :lowcase,
+ distribute: :auto,
+ conserve_components: true,
+ origin_tag: :or)
+ s, sn = segment_and_segment_name(segment) # accept either a segment line or a name
+ s.set(origin_tag, sn) if !s.get(origin_tag) # an origin that is already set is kept
+ copy_names = compute_copy_names(copy_names, sn, factor)
+ multiply_without_rgfatools(sn, factor,
+ copy_names: copy_names,
+ conserve_components: conserve_components)
+ distribute_links(distribute, sn, copy_names, factor) # called after the multiplication itself
+ return self
+ end
+
+ private
+
+ Redefined = [:multiply]
+
+ def select_distribute_end(links_distribution_policy, segment_name, factor)
+ accepted = RGFATools::Multiplication::LINKS_DISTRIBUTION_POLICY
+ if !accepted.include?(links_distribution_policy) # reject unknown policies with a RuntimeError
+ raise "Unknown links distribution policy #{links_distribution_policy}, "+
+ "accepted values are: "+
+ accepted.inspect
+ end
+ return nil if links_distribution_policy == :off # nil means: no end is selected
+ if [:B, :E].include?(links_distribution_policy) # an explicitly requested end is returned as is
+ return links_distribution_policy
+ end
+ esize = links_of([segment_name, :E]).size # remaining policies (:auto, :equal) use the link counts
+ bsize = links_of([segment_name, :B]).size
+ auto_select_distribute_end(factor, bsize, esize,
+ links_distribution_policy == :equal)
+ end
+
+ # (keep separate for testing)
+ def auto_select_distribute_end(factor, bsize, esize, equal_only) # returns :E, :B or nil
+ if esize == factor # an exact match is preferred; E is tested before B
+ return :E
+ elsif bsize == factor
+ return :B
+ elsif equal_only # only exact matches are accepted in this mode
+ return nil
+ elsif esize < 2 # E has fewer than 2 links: B if it has at least 2, otherwise no end
+ return (bsize < 2) ? nil : :B
+ elsif bsize < 2 # here esize >= 2, so E is the only candidate
+ return :E
+ elsif esize < factor # from here on both ends have at least 2 links
+ return ((bsize <= esize) ? :E :
+ ((bsize < factor) ? :B : :E))
+ elsif bsize < factor # esize > factor and bsize < factor
+ return :B
+ else # both counts exceed factor: the end with fewer links, B on ties
+ return ((bsize <= esize) ? :B : :E)
+ end
+ end
+
+ def distribute_links(links_distribution_policy, segment_name,
+ copy_names, factor)
+ return if factor < 2
+ end_type = select_distribute_end(links_distribution_policy,
+ segment_name, factor)
+ return nil if end_type.nil?
+ et_links = links_of([segment_name, end_type])
+ diff = [et_links.size - factor, 0].max
+ links_signatures = et_links.map do |l|
+ l.other_end([segment_name, end_type]).join
+ end
+ ([segment_name]+copy_names).each_with_index do |sn, i|
+ links_of([sn, end_type]).each do |l|
+ l_sig = l.other_end([sn, end_type]).join
+ to_save = links_signatures[i..i+diff].to_a
+ delete_link(l) unless to_save.include?(l_sig)
+ end
+ end
+ end
+
+ def segment_and_segment_name(segment_or_segment_name) # returns the pair [segment, name]
+ if segment_or_segment_name.kind_of?(RGFA::Line) # a line instance: take its name
+ s = segment_or_segment_name
+ sn = segment_or_segment_name.name
+ else # anything else is treated as a name and looked up with #segment
+ sn = segment_or_segment_name.to_sym
+ s = segment(sn)
+ end
+ return s, sn
+ end
+
+end
diff --git a/lib/rgfatools/p_bubbles.rb b/lib/rgfatools/p_bubbles.rb
new file mode 100644
index 0000000..af1f173
--- /dev/null
+++ b/lib/rgfatools/p_bubbles.rb
@@ -0,0 +1,66 @@
+#
+# Methods for the RGFA class, which involve a traversal of the graph following
+# links
+#
+module RGFATools::PBubbles
+
+ require "set"
+
+ # Removes all p-bubbles in the graph
+ # @return [RGFA] self
+ def remove_p_bubbles
+ visited = Set.new # names already seen among the alternatives of an examined candidate
+ segment_names.each do |sn|
+ next if visited.include?(sn)
+ if connectivity(sn) == [1,1] # presumably: exactly one link on each end (confirm)
+ s1 = neighbours([sn, :B])[0] # first neighbour at the B end
+ s2 = neighbours([sn, :E])[0] # first neighbour at the E end
+ n1 = neighbours(s1).sort
+ n2 = neighbours(s2).sort
+ n1.each {|se| visited << se[0].name} # marked even if no bubble is found below
+ if n1 == n2.map{|se| se.invert_end_type} # s1 and s2 see the same segments, from opposite ends
+ remove_proven_p_bubble(s1, s2, n1) # NOTE(review): no connectivity check on n1, unlike remove_p_bubble
+ end
+ end
+ end
+ return self
+ end
+
+ # Removes a p-bubble between segment_end1 and segment_end2
+ # @param [RGFA::SegmentEnd] segment_end1 a segment end
+ # @param [RGFA::SegmentEnd] segment_end2 another segment end
+ # @!macro [new] count_tag
+ # @param count_tag [Symbol] <i>(defaults to: +:RC+ or the value set by
+ # {#set_default_count_tag})</i> the count tag to use for coverage
+ # computation
+ # @!macro [new] unit_length
+ # @param unit_length [Integer] <i>(defaults to: 1 or the value set by
+ # {#set_count_unit_length})</i> the unit length to use for coverage
+ # computation
+ # @return [RGFA] self
+ #
+ def remove_p_bubble(segment_end1, segment_end2,
+ count_tag: @default[:count_tag],
+ unit_length: @default[:unit_length])
+ n1 = neighbours(segment_end1).sort
+ n2 = neighbours(segment_end2).sort
+ raise if n1 != n2.map{|se| se.invert_end_type}
+ raise if n1.any? {|se| connectivity(se[0]) != [1,1]}
+ remove_proven_p_bubble(segment_end1, segment_end2, n1,
+ count_tag: count_tag,
+ unit_length: unit_length)
+ return self
+ end
+
+ private
+
+ def remove_proven_p_bubble(segment_end1, segment_end2, alternatives,
+ count_tag: @default[:count_tag],
+ unit_length: @default[:unit_length]) # segment_end1/2 are not used in the body
+ coverages = alternatives.map{|s|segment!(s[0]).coverage(
+ count_tag: count_tag, unit_length: unit_length)}
+ alternatives.delete_at(coverages.index(coverages.max)) # spare the highest coverage (first one on ties); mutates the argument
+ alternatives.each {|s| delete_segment(s[0])} # every other alternative is deleted
+ end
+
+end
diff --git a/lib/rgfatools/superfluous_links.rb b/lib/rgfatools/superfluous_links.rb
new file mode 100644
index 0000000..8e256ac
--- /dev/null
+++ b/lib/rgfatools/superfluous_links.rb
@@ -0,0 +1,64 @@
+#
+# Methods which edit the graph components without traversal
+#
+module RGFATools::SuperfluousLinks
+
+ # Remove superfluous links in the presence of mandatory links
+ # for a single segment
+ # @return [RGFA] self
+ # @!macro segment_param
+ # @!macro [new] conserve_components_links
+ # @param [Boolean] conserve_components <i>(Defaults to: +true+)</i>
+ # delete links only if #cut_link?(link) is +false+ (see RGFA API).
+ def enforce_segment_mandatory_links(segment, conserve_components: true)
+ sn = segment_and_segment_name(segment)[1]
+ se = {}
+ l = {}
+ [:B, :E].each do |et|
+ se[et] = [sn, et]
+ l[et] = links_of(se[et])
+ end
+ cs = connectivity_symbols(l[:B].size, l[:E].size)
+ if cs == [1, 1]
+ oe = {}
+ [:B, :E].each {|et| oe[et] = l[et][0].other_end(se[et])}
+ return if oe[:B] == oe[:E]
+ [:B, :E].each {|et| delete_other_links(oe[et], se[et],
+ conserve_components: conserve_components)}
+ else
+ i = cs.index(1)
+ return if i.nil?
+ et = [:B, :E][i]
+ oe = l[et][0].other_end(se[et])
+ delete_other_links(oe, se[et], conserve_components: conserve_components)
+ end
+ self
+ end
+
+ # Remove superfluous links in the presence of mandatory links
+ # in the entire graph
+ # @!macro conserve_components_links
+ # @return [RGFA] self
+ def enforce_all_mandatory_links(conserve_components: true)
+ segment_names.each {|sn| enforce_segment_mandatory_links(sn,
+ conserve_components: conserve_components)}
+ self # the per-segment return values are not used
+ end
+
+ # Remove links of segment to itself
+ # @!macro segment_param
+ # @return [RGFA] self
+ def remove_self_link(segment)
+ segment_name = segment.kind_of?(RGFA::Line) ? segment.name : segment # accept a line or a name
+ unconnect_segments(segment_name, segment_name) # same segment passed as both arguments
+ self
+ end
+
+ # Remove all links of segments to themselves
+ # @return [RGFA] self
+ def remove_self_links
+ segment_names.each {|sn| remove_self_link(sn)} # applies remove_self_link to every segment name
+ self
+ end
+
+end
diff --git a/pdfdoc/cover.css b/pdfdoc/cover.css
new file mode 100644
index 0000000..52e0725
--- /dev/null
+++ b/pdfdoc/cover.css
@@ -0,0 +1,4 @@
+html, body{margin: 0; padding: 0; text-align: center}
+.page{box-sizing: border-box; height: 100%; width: 100%; border: 1px solid transparent; page-break-after: always;}
+.page-middle{height: 100%; width: 100%; display: table;}
+.page-middle-inner{height: 100%; width: 100%; display: table-cell; vertical-align: middle;}
diff --git a/pdfdoc/cover.html b/pdfdoc/cover.html
new file mode 100644
index 0000000..1a8d244
--- /dev/null
+++ b/pdfdoc/cover.html
@@ -0,0 +1,14 @@
+<html>
+<link rel="stylesheet" type="text/css" href="cover.css">
+<body>
+<div class="page">
+<div class="page-middle">
+<div class="page-middle-inner">
+ <h3>Giorgio Gonnella</h3>
+ <h1>RGFA library - API documentation</h1>
+ <h2>Version 1.2</h2>
+</div>
+</div>
+</div>
+</body>
+</html>
diff --git a/pdfdoc/print.css b/pdfdoc/print.css
new file mode 100644
index 0000000..2307667
--- /dev/null
+++ b/pdfdoc/print.css
@@ -0,0 +1,2 @@
+h1 {page-break-before: always;}
+h2 {page-break-after: avoid;}
diff --git a/rgfa.gemspec b/rgfa.gemspec
new file mode 100644
index 0000000..e15ecf3
--- /dev/null
+++ b/rgfa.gemspec
@@ -0,0 +1,68 @@
+Gem::Specification.new do |s|
+ s.name = 'rgfa'
+ s.version = '1.2'
+ s.date = '2016-09-20'
+ s.summary = 'Parse, edit and write GFA-format graphs in Ruby'
+ s.description = <<-EOF
+ The Graphical Fragment Assembly (GFA) is a proposed format which allow
+ to describe the product of sequence assembly.
+ This gem implements the proposed specifications for the GFA format
+ described under https://github.com/pmelsted/GFA-spec/blob/master/GFA-spec.md
+ as close as possible.
+ The library allows to create an RGFA object from a file in the GFA format
+ or from scratch, to enumerate the graph elements (segments, links,
+ containments, paths and header lines), to traverse the graph (by
+ traversing all links outgoing from or incoming to a segment), to search for
+ elements (e.g. which links connect two segments) and to manipulate the
+ graph (e.g. to eliminate a link or a segment or to duplicate a segment
+ distributing the read counts evenly on the copies).
+ EOF
+ s.author = 'Giorgio Gonnella'
+ s.email = 'gonnella at zbh.uni-hamburg.de'
+ s.files = [
+ 'lib/rgfa.rb',
+ 'lib/rgfa/byte_array.rb',
+ 'lib/rgfa/cigar.rb',
+ 'lib/rgfa/connectivity.rb',
+ 'lib/rgfa/containments.rb',
+ 'lib/rgfa/error.rb',
+ 'lib/rgfa/field_array.rb',
+ 'lib/rgfa/field_writer.rb',
+ 'lib/rgfa/field_parser.rb',
+ 'lib/rgfa/field_validator.rb',
+ 'lib/rgfa/headers.rb',
+ 'lib/rgfa/line/containment.rb',
+ 'lib/rgfa/line/header.rb',
+ 'lib/rgfa/line/link.rb',
+ 'lib/rgfa/line/path.rb',
+ 'lib/rgfa/line/segment.rb',
+ 'lib/rgfa/line.rb',
+ 'lib/rgfa/linear_paths.rb',
+ 'lib/rgfa/lines.rb',
+ 'lib/rgfa/links.rb',
+ 'lib/rgfa/logger.rb',
+ 'lib/rgfa/multiplication.rb',
+ 'lib/rgfa/numeric_array.rb',
+ 'lib/rgfa/paths.rb',
+ 'lib/rgfa/rgl.rb',
+ 'lib/rgfa/segment_ends_path.rb',
+ 'lib/rgfa/segment_info.rb',
+ 'lib/rgfa/segments.rb',
+ 'lib/rgfa/sequence.rb',
+ 'lib/rgfatools.rb',
+ 'lib/rgfatools/artifacts.rb',
+ 'lib/rgfatools/copy_number.rb',
+ 'lib/rgfatools/invertible_segments.rb',
+ 'lib/rgfatools/multiplication.rb',
+ 'lib/rgfatools/superfluous_links.rb',
+ 'lib/rgfatools/linear_paths.rb',
+ 'lib/rgfatools/p_bubbles.rb',
+ 'bin/gfadiff.rb',
+ 'bin/rgfa-mergelinear.rb',
+ 'bin/rgfa-simdebruijn.rb',
+ 'bin/rgfa-findcrisprs.rb',
+ ]
+ s.homepage = 'http://github.com/ggonnella/rgfa'
+ s.license = 'CC-BY-SA'
+ s.required_ruby_version = '>= 2.0'
+end
diff --git a/test/test_rgfa.rb b/test/test_rgfa.rb
new file mode 100644
index 0000000..3aa9785
--- /dev/null
+++ b/test/test_rgfa.rb
@@ -0,0 +1,101 @@
+require_relative "../lib/rgfa.rb"
+require "test/unit"
+require "tempfile"
+
+class TestRGFA < Test::Unit::TestCase
+
+ def test_initialize
+ assert_nothing_raised { RGFA.new }
+ gfa = RGFA.new
+ assert_equal(RGFA, gfa.class)
+ end
+
+ def test_segment_names
+ gfa = RGFA.new
+ assert_equal([], gfa.segment_names)
+ gfa << "S\t1\t*"
+ gfa << "S\t2\t*"
+ assert_equal([:"1", :"2"], gfa.segment_names)
+ gfa.delete_segment("1")
+ assert_equal([:"2"], gfa.segment_names)
+ end
+
+ def test_path_names
+ gfa = RGFA.new
+ assert_equal([], gfa.path_names)
+ gfa << "P\t3\t1+,4-\t*"
+ assert_equal([:"3"], gfa.path_names)
+ gfa.delete_path("3")
+ assert_equal([], gfa.path_names)
+ end
+
+ def test_validate!
+ gfa = RGFA.new
+ gfa << "S\t1\t*"
+ assert_nothing_raised { gfa.validate! }
+ gfa << "L\t1\t+\t2\t-\t*"
+ assert_raise(RGFA::LineMissingError) { gfa.validate! }
+ gfa << "S\t2\t*"
+ assert_nothing_raised { gfa.validate! }
+ gfa << "P\t3\t1+,4-\t*"
+ assert_raise(RGFA::LineMissingError) { gfa.validate! }
+ gfa << "S\t4\t*"
+ assert_raise(RGFA::LineMissingError) { gfa.validate! }
+ gfa << "L\t4\t+\t1\t-\t*"
+ assert_nothing_raised { gfa.validate! }
+ end
+
+ def test_to_s
+ lines = ["H\tVN:Z:1.0","S\t1\t*","S\t2\t*","S\t3\t*",
+ "L\t1\t+\t2\t-\t*","C\t1\t+\t3\t-\t12\t*","P\t4\t1+,2-\t*"]
+ gfa = RGFA.new
+ lines.each {|l| gfa << l}
+ assert_equal(lines.join("\n")+"\n", gfa.to_s)
+ end
+
+ def test_to_rgfa
+ gfa = RGFA.new
+ gfa2 = gfa.to_rgfa
+ assert(gfa2)
+ assert_equal(RGFA, gfa2.class)
+ end
+
+ def test_from_file
+ filename = "test/testdata/example1.gfa"
+ gfa = RGFA.from_file(filename)
+ assert(gfa)
+ assert_equal(IO.read(filename), gfa.to_s)
+ end
+
+ def test_to_file
+ filename = "test/testdata/example1.gfa"
+ gfa = RGFA.from_file(filename)
+ tmp = Tempfile.new("example1")
+ gfa.to_file(tmp.path)
+ tmp.rewind
+ assert_equal(IO.read(filename), IO.read(tmp))
+ end
+
+ def test_string_to_rgfa
+ lines = ["H\tVN:Z:1.0","S\t1\t*","S\t2\t*","S\t3\t*",
+ "L\t1\t+\t2\t-\t*","C\t1\t+\t3\t-\t12\t*","P\t4\t1+,2-\t*"]
+ gfa1 = RGFA.new
+ lines.each {|l| gfa1 << l}
+ gfa2 = lines.join("\n").to_rgfa
+ assert(gfa2)
+ assert_equal(RGFA, gfa2.class)
+ assert_equal(gfa1.to_s, gfa2.to_s)
+ end
+
+ def test_array_to_rgfa
+ lines = ["H\tVN:Z:1.0","S\t1\t*","S\t2\t*","S\t3\t*",
+ "L\t1\t+\t2\t-\t*","C\t1\t+\t3\t-\t12\t*","P\t4\t1+,2-\t*"]
+ gfa1 = RGFA.new
+ lines.each {|l| gfa1 << l}
+ gfa2 = lines.to_rgfa
+ assert(gfa2)
+ assert_equal(RGFA, gfa2.class)
+ assert_equal(gfa1.to_s, gfa2.to_s)
+ end
+
+end
diff --git a/test/test_rgfa_byte_array.rb b/test/test_rgfa_byte_array.rb
new file mode 100644
index 0000000..a96161f
--- /dev/null
+++ b/test/test_rgfa_byte_array.rb
@@ -0,0 +1,41 @@
+require_relative "../lib/rgfa.rb"
+require "test/unit"
+
+class TestRGFAByteArray < Test::Unit::TestCase
+
+ def test_byte_array_creation
+ a, b = nil
+ assert_nothing_raised { a = RGFA::ByteArray.new([1,2,3,4,5]) }
+ assert_nothing_raised { b = [1,2,3,4,5].to_byte_array }
+ assert_equal(a, b)
+ end
+
+ def test_byte_array_validation
+ a = nil
+ assert_nothing_raised { a = RGFA::ByteArray.new([1,2,3,4,5]) }
+ assert_nothing_raised { a.validate! }
+ assert_nothing_raised { a = RGFA::ByteArray.new([1,2,3,4,356]) }
+ assert_raises(RGFA::ByteArray::ValueError) { a.validate! }
+ end
+
+ def test_from_string
+ a = nil
+ assert_nothing_raised { a = "12ACF4AA601C1F".to_byte_array }
+ b = [18, 172, 244, 170, 96, 28, 31].to_byte_array
+ assert_equal(b, a)
+ assert_raises(RGFA::ByteArray::FormatError) {
+ a = "12ACF4AA601C1".to_byte_array }
+ assert_raises(RGFA::ByteArray::FormatError) {
+ a = "".to_byte_array }
+ assert_raises(ArgumentError) { a = "12ACG4AA601C1F".to_byte_array }
+ end
+
+ def test_to_string
+ a = [18, 172, 244, 170, 96, 28, 31].to_byte_array
+ b = "12ACF4AA601C1F"
+ assert_equal(b, a.to_s)
+ a = [18, 172, 280, 170, 96, 28, 31].to_byte_array
+ assert_raises(RGFA::ByteArray::ValueError) { a.to_s }
+ end
+
+end
diff --git a/test/test_rgfa_cigar.rb b/test/test_rgfa_cigar.rb
new file mode 100644
index 0000000..973b50b
--- /dev/null
+++ b/test/test_rgfa_cigar.rb
@@ -0,0 +1,33 @@
+require_relative "../lib/rgfa.rb"
+require "test/unit"
+
+class TestRGFACigar < Test::Unit::TestCase
+
+ def test_from_string_nonempty
+ assert_equal(RGFA::CIGAR.new([
+ RGFA::CIGAR::Operation.new(12,:M),
+ RGFA::CIGAR::Operation.new(1,:D),
+ RGFA::CIGAR::Operation.new(2,:I)]),"12M1D2I".to_cigar)
+ end
+
+ def test_from_string_empty
+ assert_equal([],"*".to_cigar)
+ end
+
+ def test_from_string_invalid
+ assert_raises(RGFA::CIGAR::ValueError){"12x1D2I".to_cigar}
+ end
+
+ def test_to_s_nonempty
+ assert_equal("12M1D2I",
+ RGFA::CIGAR.new([
+ RGFA::CIGAR::Operation.new(12,:M),
+ RGFA::CIGAR::Operation.new(1,:D),
+ RGFA::CIGAR::Operation.new(2,:I)]).to_s)
+ end
+
+ def test_to_s_empty
+ assert_equal("*", RGFA::CIGAR.new([]).to_s)
+ end
+
+end
diff --git a/test/test_rgfa_edit.rb b/test/test_rgfa_edit.rb
new file mode 100644
index 0000000..bd83dea
--- /dev/null
+++ b/test/test_rgfa_edit.rb
@@ -0,0 +1,96 @@
+require_relative "../lib/rgfa.rb"
+require "test/unit"
+
+class TestRGFAEdit < Test::Unit::TestCase
+
+ def test_rename
+ gfa = ["S\t0\t*", "S\t1\t*", "S\t2\t*", "L\t0\t+\t2\t-\t12M",
+ "C\t1\t+\t0\t+\t12\t12M", "P\t4\t2+,0-\t12M"].to_rgfa
+ gfa.rename("0", "X")
+ assert_equal([:"X", :"1", :"2"].sort, gfa.segment_names.sort)
+ assert_equal("L\tX\t+\t2\t-\t12M", gfa.links[0].to_s)
+ assert_equal("C\t1\t+\tX\t+\t12\t12M", gfa.containments[0].to_s)
+ assert_equal("P\t4\t2+,X-\t12M", gfa.paths[0].to_s)
+ assert_raises(RGFA::LineMissingError){gfa.links_of(["0", :E])}
+ assert_equal("L\tX\t+\t2\t-\t12M", gfa.links_of(["X", :E])[0].to_s)
+ assert_equal("C\t1\t+\tX\t+\t12\t12M", gfa.contained_in("1")[0].to_s)
+ assert_raises(RGFA::LineMissingError){gfa.containing("0")}
+ assert_equal("C\t1\t+\tX\t+\t12\t12M", gfa.containing("X")[0].to_s)
+ assert_raises(RGFA::LineMissingError){gfa.paths_with("0")}
+ assert_equal("P\t4\t2+,X-\t12M", gfa.paths_with("X")[0].to_s)
+ end
+
+ def test_multiply_segment
+ gfa = RGFA.new
+ gfa << "H\tVN:Z:1.0"
+ s = ["S\t0\t*\tRC:i:600",
+ "S\t1\t*\tRC:i:6000",
+ "S\t2\t*\tRC:i:60000"]
+ l = "L\t1\t+\t2\t+\t12M"
+ c = "C\t1\t+\t0\t+\t12\t12M"
+ p = "P\t3\t2+,0-\t12M"
+ (s + [l,c,p]).each {|line| gfa << line }
+ assert_equal(s, gfa.segments.map(&:to_s))
+ assert_equal([l], gfa.links.select{|n|!n.virtual?}.map(&:to_s))
+ assert_equal([c], gfa.containments.map(&:to_s))
+ assert_equal(l, gfa.link(["1", :E], ["2", :B]).to_s)
+ assert_equal(c, gfa.containment("1", "0").to_s)
+ assert_raises(RGFA::LineMissingError){gfa.link(["1a", :E], ["2", :B])}
+ assert_raises(RGFA::LineMissingError){gfa.containment("5", "0")}
+ assert_equal(6000, gfa.segment("1").RC)
+ gfa.multiply("1", 2)
+ assert_equal(l, gfa.link(["1", :E], ["2", :B]).to_s)
+ assert_equal(c, gfa.containment("1", "0").to_s)
+ assert_not_equal(nil, gfa.link(["1b", :E], ["2", :B]))
+ assert_not_equal(nil, gfa.containment("1b", "0"))
+ assert_equal(3000, gfa.segment("1").RC)
+ assert_equal(3000, gfa.segment("1b").RC)
+ gfa.multiply("1b", 3 , copy_names:["6","7"])
+ assert_equal(l, gfa.link(["1", :E], ["2", :B]).to_s)
+ assert_not_equal(nil, gfa.link(["1b", :E], ["2", :B]))
+ assert_not_equal(nil, gfa.link(["6", :E], ["2", :B]))
+ assert_not_equal(nil, gfa.link(["7", :E], ["2", :B]))
+ assert_not_equal(nil, gfa.containment("1b", "0"))
+ assert_not_equal(nil, gfa.containment("6", "0"))
+ assert_not_equal(nil, gfa.containment("7", "0"))
+ assert_equal(3000, gfa.segment("1").RC)
+ assert_equal(1000, gfa.segment("1b").RC)
+ assert_equal(1000, gfa.segment("6").RC)
+ assert_equal(1000, gfa.segment("7").RC)
+ end
+
+ def test_multiply_segment_copy_names
+ gfa = ["H\tVN:Z:1.0",
+ "S\t1\t*\tRC:i:600",
+ "S\t1b\t*\tRC:i:6000",
+ "S\t2\t*\tRC:i:60000",
+ "S\t3\t*\tRC:i:60000"].to_rgfa
+ gfa.multiply("2", 2, copy_names: :upcase)
+ assert_nothing_raised {gfa.segment!("2B")}
+ gfa.multiply("2", 2, copy_names: :upcase)
+ assert_nothing_raised {gfa.segment!("2C")}
+ gfa.multiply("2", 2, copy_names: :copy)
+ assert_nothing_raised {gfa.segment!("2_copy")}
+ gfa.multiply("2", 2, copy_names: :copy)
+ assert_nothing_raised {gfa.segment!("2_copy2")}
+ gfa.multiply("2", 2, copy_names: :copy)
+ assert_nothing_raised {gfa.segment!("2_copy3")}
+ gfa.multiply("2_copy", 2, copy_names: :copy)
+ assert_nothing_raised {gfa.segment!("2_copy4")}
+ gfa.multiply("2_copy4", 2, copy_names: :copy)
+ assert_nothing_raised {gfa.segment!("2_copy5")}
+ gfa.multiply("2", 2, copy_names: :number)
+ assert_nothing_raised {gfa.segment!("4")}
+ gfa.multiply("1b", 2)
+ assert_nothing_raised {gfa.segment!("1c")}
+ gfa.multiply("1b", 2, copy_names: :number)
+ assert_nothing_raised {gfa.segment!("1b2")}
+ gfa.multiply("1b", 2, copy_names: :copy)
+ assert_nothing_raised {gfa.segment!("1b_copy")}
+ gfa.multiply("1b_copy", 2, copy_names: :lowcase)
+ assert_nothing_raised {gfa.segment!("1b_copz")}
+ gfa.multiply("1b_copy", 2, copy_names: :upcase)
+ assert_nothing_raised {gfa.segment!("1b_copyB")}
+ end
+
+end
diff --git a/test/test_rgfa_field_parser.rb b/test/test_rgfa_field_parser.rb
new file mode 100644
index 0000000..f9decda
--- /dev/null
+++ b/test/test_rgfa_field_parser.rb
@@ -0,0 +1,55 @@
+require_relative "../lib/rgfa.rb"
+require "test/unit"
+
+class TestRGFAFieldParser < Test::Unit::TestCase
+
+ def test_parse_gfa_opfield
+ o = "AA:i:1"
+ assert_equal([:AA,:i,"1"], o.parse_gfa_optfield)
+ assert_raise(RGFA::FieldParser::FormatError) do
+ "1A:A:A".parse_gfa_optfield
+ end
+ assert_raise(RGFA::FieldParser::FormatError) do
+ "_A:A:A".parse_gfa_optfield
+ end
+ assert_raise(RGFA::FieldParser::FormatError) do
+ "A:A:A".parse_gfa_optfield
+ end
+ assert_raise(RGFA::FieldParser::FormatError) do
+ "AAA:A:A".parse_gfa_optfield
+ end
+ assert_raise(RGFA::FieldParser::FormatError) {"AA:C:1".parse_gfa_optfield}
+ assert_raise(RGFA::FieldParser::FormatError) {"AA:AA:1".parse_gfa_optfield}
+ assert_raise(RGFA::FieldParser::FormatError) {"AA:a:1".parse_gfa_optfield}
+ end
+
+ def test_parse_gfa_field_A
+ assert_equal("1", "1".parse_gfa_field(datatype: :A))
+ end
+
+ def test_parse_gfa_field_i
+ assert_equal(12, "12".parse_gfa_field(datatype: :i))
+ end
+
+ def test_parse_gfa_field_f
+ assert_equal(1.2, "1.2".parse_gfa_field(datatype: :f))
+ end
+
+ def test_parse_gfa_field_Z
+ assert_equal("1.2", "1.2".parse_gfa_field(datatype: :Z))
+ end
+
+ def test_parse_gfa_field_H
+ assert_equal([26], "1A".parse_gfa_field(datatype: :H))
+ end
+
+ def test_parse_gfa_field_B
+ assert_equal([12,12,12], "c,12,12,12".parse_gfa_field(datatype: :B))
+ assert_equal([1.2,1.2,1.2], "f,1.2,1.2,1.2".parse_gfa_field(datatype: :B))
+ end
+
+ def test_parse_gfa_field_J
+ assert_equal({"1" => 2}, "{\"1\":2}".parse_gfa_field(datatype: :J))
+ end
+
+end
diff --git a/test/test_rgfa_field_validator.rb b/test/test_rgfa_field_validator.rb
new file mode 100644
index 0000000..ec021f8
--- /dev/null
+++ b/test/test_rgfa_field_validator.rb
@@ -0,0 +1,56 @@
+require_relative "../lib/rgfa.rb"
+require "test/unit"
+
+class TestRGFAFieldValidator < Test::Unit::TestCase
+
+ def test_field_gfa_field_validate_i
+ assert_nothing_raised { "1".validate_gfa_field!(:i) }
+ assert_nothing_raised { "12".validate_gfa_field!(:i) }
+ assert_nothing_raised { "-12".validate_gfa_field!(:i) }
+ assert_raise(RGFA::FieldParser::FormatError) {"1A".validate_gfa_field!(:i)}
+ assert_raise(RGFA::FieldParser::FormatError) {"A1".validate_gfa_field!(:i)}
+ assert_raise(RGFA::FieldParser::FormatError) {"2.1".validate_gfa_field!(:i)}
+ end
+
+ def test_field_gfa_field_validate_A
+ assert_nothing_raised { "A".validate_gfa_field!(:A) }
+ assert_raise(RGFA::FieldParser::FormatError) {"AA".validate_gfa_field!(:A)}
+ end
+
+ def test_field_gfa_field_validate_f
+ assert_nothing_raised { "-12.1".validate_gfa_field!(:f) }
+ assert_nothing_raised { "-12.1E-2".validate_gfa_field!(:f) }
+ assert_raise(RGFA::FieldParser::FormatError) do
+ "2.1X".validate_gfa_field!(:f)
+ end
+ end
+
+ def test_field_gfa_field_validate_Z
+ assert_nothing_raised { "-12.1E-2".validate_gfa_field!(:Z) }
+ end
+
+ def test_field_gfa_field_validate_H
+ assert_nothing_raised { "0A12121EFF".validate_gfa_field!(:H) }
+ assert_raise(RGFA::FieldParser::FormatError) do
+ "21X1".validate_gfa_field!(:H)
+ end
+ end
+
+ def test_field_gfa_field_validate_B
+ assert_nothing_raised { "i,12,-5".validate_gfa_field!(:B) }
+ assert_raise(RGFA::FieldParser::FormatError) do
+ "C,X1".validate_gfa_field!(:B)
+ end
+ assert_raise(RGFA::FieldParser::FormatError) do
+ "f.1.1".validate_gfa_field!(:B)
+ end
+ end
+
+ def test_field_gfa_field_validate_J
+ assert_nothing_raised {"{\"1\":2}".validate_gfa_field!(:J) }
+ assert_raise(RGFA::FieldParser::FormatError) do
+ "1\t2".validate_gfa_field!(:J)
+ end
+ end
+
+end
diff --git a/test/test_rgfa_field_writer.rb b/test/test_rgfa_field_writer.rb
new file mode 100644
index 0000000..da8c421
--- /dev/null
+++ b/test/test_rgfa_field_writer.rb
@@ -0,0 +1,45 @@
+require_relative "../lib/rgfa.rb"
+require "test/unit"
+
+class TestRGFAFieldWriter < Test::Unit::TestCase
+
+ def test_field_writer_i
+ assert_equal("13", 13.to_gfa_field)
+ end
+
+ def test_field_writer_f
+ assert_equal("1.3", 1.3.to_gfa_field)
+ end
+
+ def test_field_writer_Z
+ assert_equal("1B", "1B".to_gfa_field)
+ end
+
+ def test_field_writer_H
+ assert_equal("0D0D0D", [13,13,13].to_byte_array.to_gfa_field)
+ assert_raise(RGFA::ByteArray::ValueError) do
+ [13,13,1.3].to_byte_array.to_gfa_field
+ end
+ assert_raise(RGFA::ByteArray::ValueError) do
+ [13,13,350].to_byte_array.to_gfa_field
+ end
+ end
+
+ def test_field_writer_B
+ assert_equal("C,13,13,13", [13,13,13].to_gfa_field)
+ assert_equal("f,1.3,1.3,1.3", [1.3,1.3,1.3].to_gfa_field)
+ assert_raise(RGFA::NumericArray::ValueError) do
+ [13,1.3,1.3].to_gfa_field(datatype: :B)
+ end
+ end
+
+ def test_field_writer_J
+ assert_equal("[\"A\",12]", ["A", 12].to_gfa_field)
+ assert_equal("{\"A\":12}", {"A" => 12}.to_gfa_field)
+ end
+
+ def test_field_writer_as_optfield
+ assert_equal("AA:i:13", 13.to_gfa_optfield(:AA))
+ end
+
+end
diff --git a/test/test_rgfa_line.rb b/test/test_rgfa_line.rb
new file mode 100644
index 0000000..c144ff4
--- /dev/null
+++ b/test/test_rgfa_line.rb
@@ -0,0 +1,199 @@
+require_relative "../lib/rgfa.rb"
+require "test/unit"
+
+class TestRGFALine < Test::Unit::TestCase
+
+ def test_initialize_not_enough_required
+ assert_nothing_raised do
+ RGFA::Line::Segment.new(["1","*"])
+ end
+ assert_raise(RGFA::Line::RequiredFieldMissingError) do
+ RGFA::Line::Segment.new(["1"])
+ end
+ end
+
+ def test_initialize_too_many_required
+ assert_raise(RGFA::FieldParser::FormatError) do
+ RGFA::Line::Segment.new(["1","*","*"])
+ end
+ end
+
+ def test_initialize_predefined_optfield_wrong_type
+ assert_nothing_raised do
+ RGFA::Line::Header.new(["VN:Z:1"])
+ end
+ assert_raise(RGFA::Line::PredefinedOptfieldTypeError) do
+ RGFA::Line::Header.new(["VN:i:1"])
+ end
+ end
+
+ def test_initialize_wrong_optfield_format
+ assert_raise(RGFA::FieldParser::FormatError) do
+ RGFA::Line::Header.new(["VN i:1"])
+ end
+ end
+
+ def test_initialize_reqfield_type_error
+ assert_raise(RGFA::FieldParser::FormatError) do
+ RGFA::Line::Segment.new(["1\t1","*","*"])
+ end
+ end
+
+ def test_initialize_optfield_type_error
+ assert_raise(ArgumentError) do
+ RGFA::Line::Header.new(["zz:i:1A"])
+ end
+ end
+
+ def test_initialize_duplicate_optfield
+ assert_raise(RGFA::Line::DuplicatedOptfieldNameError) do
+ RGFA::Line::Header.new(["zz:i:1","zz:i:2"])
+ end
+ assert_raise(RGFA::Line::DuplicatedOptfieldNameError) do
+ RGFA::Line::Header.new(["zz:i:1", "VN:Z:1", "zz:i:2"])
+ end
+ end
+
+ def test_initialize_custom_optfield
+ assert_raise(RGFA::Line::CustomOptfieldNameError) do
+ RGFA::Line::Header.new(["ZZ:Z:1"])
+ end
+ end
+
+ def test_clone
+ l = "H\tVN:Z:1.0".to_rgfa_line
+ l1 = l
+ l2 = l.clone
+ assert_equal(RGFA::Line::Header, l.class)
+ assert_equal(RGFA::Line::Header, l2.class)
+ l2.VN="2.0"
+ assert_equal("2.0", l2.VN)
+ assert_equal("1.0", l.VN)
+ l1.VN="2.0"
+ assert_equal("2.0", l.VN)
+ end
+
+ def test_respond_to
+ l = RGFA::Line::Link.new(["1","+","2","-","*","zz:Z:yes","KC:i:100"])
+ # record_type
+ assert(l.respond_to?(:record_type))
+ # reqfields
+ assert(l.respond_to?(:from))
+ assert(l.respond_to?(:from=))
+ # predefined optfields
+ assert(l.respond_to?(:KC))
+ assert(l.respond_to?(:KC!))
+ assert(l.respond_to?(:KC=))
+ # custom optfields
+ assert(l.respond_to?(:zz))
+ assert(l.respond_to?(:zz!))
+ assert(l.respond_to?(:zz=))
+ # not-yet-existing optfields
+ assert(l.respond_to?(:aa))
+ assert(l.respond_to?(:aa!))
+ assert(l.respond_to?(:aa=))
+ end
+
+ def test_record_type
+ l = RGFA::Line::Header.new(["xx:i:13","VN:Z:HI"])
+ assert_equal(:H, l.record_type)
+ assert_raise(NoMethodError) { l.record_type = "S" }
+ end
+
+ def test_field_getters_required_fields
+ l = RGFA::Line::Segment.new(["12","*","xx:i:13","KC:i:10"])
+ assert_equal(:"12", l.name)
+ assert_raise(NoMethodError) { l.zzz }
+ end
+
+ def test_field_getters_existing_optional_fields
+ l = RGFA::Line::Segment.new(["12","*","xx:i:13","KC:i:10"])
+ assert_equal(:xx, l.fieldnames[2])
+ assert_equal(:xx, l.optional_fieldnames[0])
+ assert_equal("13", l.field_to_s(:xx))
+ assert_equal(13, l.xx)
+ assert_equal(13, l.xx!)
+ assert_equal("10", l.field_to_s(:KC))
+ assert_equal(10, l.KC)
+ assert_equal(10, l.KC!)
+ end
+
+ def test_field_getters_not_existing_optional_fields
+ l = RGFA::Line::Header.new(["xx:i:13","VN:Z:HI"])
+ assert_equal(nil, l.zz)
+ assert_raise(RGFA::Line::TagMissingError) { l.zz! }
+ end
+
+ def test_field_setters_required_fields
+ l = RGFA::Line::Segment.new(["12","*","xx:i:13","KC:i:1200"])
+ assert_raise(RGFA::FieldParser::FormatError) { l.name = "A\t1";
+ l.validate_field!(:name) }
+ l.name = "14"
+ assert_equal(:"14", l.name)
+ end
+
+ def test_field_setters_existing_optional_fields
+ l = RGFA::Line::Header.new(["xx:i:13","VN:Z:HI"], validate: 5)
+ assert_equal(13, l.xx)
+ l.xx = 15
+ assert_equal(15, l.xx)
+ assert_raise(RGFA::FieldParser::FormatError) { l.xx = "1A" }
+ assert_nothing_raised { l.set_datatype(:xx, :Z); l.xx = "1A" }
+ assert_equal("HI", l.VN)
+ l.VN = "HO"
+ assert_equal("HO", l.VN)
+ end
+
+ def test_field_setters_not_existing_optional_fields
+ l = RGFA::Line::Header.new(["xx:i:13","VN:Z:HI"])
+ assert_nothing_raised { l.zz="1" }
+ assert_equal("1", l.zz)
+ assert_equal(:"Z", l.zz.default_gfa_datatype)
+ assert_nothing_raised { l.zi=1 }
+ assert_equal(1, l.zi)
+ assert_equal(:"i", l.zi.default_gfa_datatype)
+ assert_nothing_raised { l.zf=1.0 }
+ assert_equal(1.0, l.zf)
+ assert_equal(:"f", l.zf.default_gfa_datatype)
+ assert_nothing_raised { l.bf=[1.0,1.0] }
+ assert_equal([1.0,1.0], l.bf)
+ assert_equal(:"B", l.bf.default_gfa_datatype)
+ assert_nothing_raised { l.bi=[1.0,1.0] }
+ assert_equal([1,1], l.bi)
+ assert_equal(:"B", l.bi.default_gfa_datatype)
+ assert_nothing_raised { l.ba=[1.0,1] }
+ assert_equal([1.0,1], l.ba)
+ assert_equal(:"J", l.ba.default_gfa_datatype)
+ assert_nothing_raised { l.bh={:a => 1.0, :b => 1} }
+ assert_equal({"a"=>1.0,"b"=>1}, l.to_s.to_rgfa_line.bh)
+ assert_equal(:"J", l.bh.default_gfa_datatype)
+ assert_raise(NoMethodError) { l.zzz="1" }
+ end
+
+ def test_add_optfield
+ l = RGFA::Line::Header.new(["xx:i:13","VN:Z:HI"])
+ assert_equal(nil, l.xy)
+ l.set(:xy, "HI")
+ assert_equal("HI", l.xy)
+ end
+
+ def test_to_s
+ fields = ["xx:i:13","VN:Z:HI"]
+ l = RGFA::Line::Header.new(fields.clone)
+ assert_equal((["H"]+fields).join("\t"),l.to_s)
+ end
+
+ def test_unknown_record_type
+ assert_raise(RGFA::Line::UnknownRecordTypeError) {"Z\txxx".to_rgfa_line}
+ end
+
+ def test_to_rgfa_line
+ str = "H\tVN:Z:1.0"
+ l = str.to_rgfa_line
+ assert_equal(RGFA::Line::Header, l.class)
+ assert_equal(RGFA::Line::Header, l.to_rgfa_line.class)
+ assert_equal(str, l.to_rgfa_line.to_s)
+ assert_equal(l, l.to_rgfa_line)
+ end
+
+end
diff --git a/test/test_rgfa_line_containment.rb b/test/test_rgfa_line_containment.rb
new file mode 100644
index 0000000..55d6415
--- /dev/null
+++ b/test/test_rgfa_line_containment.rb
@@ -0,0 +1,43 @@
+require_relative "../lib/rgfa.rb"
+require "test/unit"
+
+class TestRGFALineContainment < Test::Unit::TestCase
+
+ def test_from_string
+ fields=["C","1","+","2","-","12","12M","MQ:i:1232","NM:i:3","ab:Z:abcd"]
+ str=fields.join("\t")
+ assert_nothing_raised { str.to_rgfa_line }
+ assert_equal(RGFA::Line::Containment, str.to_rgfa_line.class)
+ assert_equal(fields[0].to_sym, str.to_rgfa_line.record_type)
+ assert_equal(fields[1].to_sym, str.to_rgfa_line.from)
+ assert_equal(fields[2].to_sym, str.to_rgfa_line.from_orient)
+ assert_equal(fields[3].to_sym, str.to_rgfa_line.to)
+ assert_equal(fields[4].to_sym, str.to_rgfa_line.to_orient)
+ assert_equal(12, str.to_rgfa_line.pos)
+ assert_equal([RGFA::CIGAR::Operation.new(12,:M)], str.to_rgfa_line.overlap)
+ assert_equal(1232, str.to_rgfa_line.MQ)
+ assert_equal(3, str.to_rgfa_line.NM)
+ assert_equal("abcd", str.to_rgfa_line.ab)
+ assert_raises(RGFA::FieldParser::FormatError) { (str+"\tH1").to_rgfa_line }
+ assert_raises(RGFA::Line::RequiredFieldMissingError) { "C\tH".to_rgfa_line }
+ assert_raises(RGFA::FieldParser::FormatError) do
+ f=fields.dup; f[2]="x"; f.join("\t").to_rgfa_line(validate: 3)
+ end
+ assert_raises(RGFA::FieldParser::FormatError) do
+ f=fields.dup; f[4]="x"; f.join("\t").to_rgfa_line(validate: 3)
+ end
+ assert_raises(ArgumentError) do
+ f=fields.dup; f[5]="x"; f.join("\t").to_rgfa_line(validate: 3)
+ end
+ assert_raises(RGFA::CIGAR::ValueError) do
+ f=fields.dup; f[6]="x"; f.join("\t").to_rgfa_line(validate: 3)
+ end
+ assert_raises(RGFA::Line::PredefinedOptfieldTypeError) do
+ f=fields.dup; f[7]="MQ:Z:1232"; f.join("\t").to_rgfa_line(validate: 3)
+ end
+ assert_raises(RGFA::Line::PredefinedOptfieldTypeError) do
+ f=fields.dup; f[8]="NM:Z:1232"; f.join("\t").to_rgfa_line(validate: 3)
+ end
+ end
+
+end
diff --git a/test/test_rgfa_line_creators.rb b/test/test_rgfa_line_creators.rb
new file mode 100644
index 0000000..4fd61f6
--- /dev/null
+++ b/test/test_rgfa_line_creators.rb
@@ -0,0 +1,143 @@
+require_relative "../lib/rgfa.rb"
+require "test/unit"
+
+class TestRGFALineCreators < Test::Unit::TestCase
+
+ def test_add_headers
+ gfa = RGFA.new
+ h = "H\tVN:Z:1.0"
+ assert_nothing_raised { gfa << h }
+ assert_equal([h], gfa.headers.map(&:to_s))
+ end
+
+ def test_add_segments
+ gfa = RGFA.new
+ s1 = "S\t1\t*".to_rgfa_line
+ s2 = "S\t2\t*".to_rgfa_line
+ assert_nothing_raised { gfa << s1 }
+ assert_nothing_raised { gfa << s2 }
+ assert_equal([s1, s2], gfa.segments)
+ assert_equal([:"1", :"2"], gfa.segment_names)
+ assert_equal(s1, gfa.segment("1"))
+ assert_equal(nil, gfa.segment("0"))
+ assert_nothing_raised { gfa.segment!("1") }
+ assert_raises(RGFA::LineMissingError) { gfa.segment!("0") }
+ assert_raises(RGFA::DuplicatedLabelError) { gfa << s2 }
+ end
+
+ def test_add_links
+ s1 = "S\t1\t*"
+ s2 = "S\t2\t*"
+ l1 = "L\t1\t+\t2\t+\t12M".to_rgfa_line
+ l2 = "L\t1\t+\t3\t+\t12M"
+ gfa = RGFA.new
+ gfa << s1
+ gfa << s2
+ assert_nothing_raised { gfa << l1 }
+ assert_equal([l1], gfa.links)
+ assert_equal(l1, gfa.link(["1", :E], ["2", :B]))
+ assert_equal(l1, gfa.link(["2", :B], ["1", :E]))
+ assert_equal(nil, gfa.link(["2", :E], ["1", :B]))
+ assert_nothing_raised {gfa.link!(["1", :E], ["2", :B])}
+ assert_raises(RGFA::LineMissingError) {gfa.link!(["2", :E], ["1", :B])}
+ assert_nothing_raised { gfa << l2 }
+ end
+
+ def test_add_containments
+ s1 = "S\t1\t*"
+ s2 = "S\t2\t*"
+ c1 = "C\t1\t+\t2\t+\t12\t12M".to_rgfa_line
+ c2 = "C\t1\t+\t3\t+\t12\t12M"
+ gfa = RGFA.new
+ gfa << s1
+ gfa << s2
+ assert_nothing_raised { gfa << c1 }
+ assert_equal([c1], gfa.containments)
+ assert_equal(c1, gfa.containment("1", "2"))
+ assert_nothing_raised {gfa.containment!("1", "2")}
+ assert_raises(RGFA::LineMissingError) {gfa.containment!("2", "1")}
+ assert_nothing_raised { gfa << c2 }
+ end
+
+ def test_add_paths
+ s1 = "S\t1\t*"
+ s2 = "S\t2\t*"
+ p1 = "P\t4\t1+,2+\t122M".to_rgfa_line
+ p2 = "P\t1\t1+,2+\t122M"
+ p3 = "P\t5\t1+,2+,3+\t122M,120M"
+ gfa = RGFA.new
+ gfa << s1
+ gfa << s2
+ assert_nothing_raised { gfa << p1 }
+ assert_equal([p1], gfa.paths)
+ assert_equal([:"4"], gfa.path_names)
+ assert_equal(p1, gfa.path("4"))
+ assert_equal(nil, gfa.path("5"))
+ assert_nothing_raised {gfa.path!("4")}
+ assert_raises(RGFA::LineMissingError) {gfa.path!("5")}
+ assert_raises(RGFA::DuplicatedLabelError) { gfa << p2 }
+ assert_nothing_raised { gfa << p3 }
+ end
+
+ def test_segments_first_order
+ s1 = "S\t1\t*"
+ s2 = "S\t2\t*"
+ l1 = "L\t1\t+\t2\t+\t122M"
+ l2 = "L\t1\t+\t3\t+\t122M"
+ c1 = "C\t1\t+\t2\t+\t12\t12M"
+ c2 = "C\t1\t+\t3\t+\t12\t12M"
+ p1 = "P\t4\t1+,2+\t122M"
+ p2 = "P\t1\t1+,2+\t122M"
+ p3 = "P\t5\t1+,3+\t122M"
+ gfa = RGFA.new
+ gfa.require_segments_first_order
+ gfa << s1
+ gfa << s2
+ assert_nothing_raised { gfa << l1 }
+ assert_raises(RGFA::LineMissingError) { gfa << l2 }
+ assert_nothing_raised { gfa << c1 }
+ assert_raises(RGFA::LineMissingError) { gfa << c2 }
+ assert_nothing_raised { gfa << p1 }
+ assert_raises(RGFA::DuplicatedLabelError) { gfa << p2 }
+ assert_raises(RGFA::LineMissingError) { gfa << p3 }
+ end
+
+ def test_header_add
+ gfa = RGFA.new
+ gfa << "H\tVN:Z:1.0"
+ gfa << "H\taa:i:12\tab:Z:test1"
+ gfa << "H\tac:Z:test2"
+ gfa.header.add(:aa, 15)
+ assert_equal(
+ [
+ "H\tVN:Z:1.0",
+ "H\taa:i:12",
+ "H\taa:i:15",
+ "H\tab:Z:test1",
+ "H\tac:Z:test2",
+ ],
+ gfa.headers.map(&:to_s).sort)
+ gfa.header.add(:aa, 16)
+ assert_equal(
+ [
+ "H\tVN:Z:1.0",
+ "H\taa:i:12",
+ "H\taa:i:15",
+ "H\taa:i:16",
+ "H\tab:Z:test1",
+ "H\tac:Z:test2",
+ ],
+ gfa.headers.map(&:to_s).sort)
+ gfa.header.delete(:aa)
+ gfa.header.aa = 26
+ assert_equal(
+ [
+ "H\tVN:Z:1.0",
+ "H\taa:i:26",
+ "H\tab:Z:test1",
+ "H\tac:Z:test2",
+ ],
+ gfa.headers.map(&:to_s).sort)
+ end
+
+end
diff --git a/test/test_rgfa_line_destructors.rb b/test/test_rgfa_line_destructors.rb
new file mode 100644
index 0000000..0d4b960
--- /dev/null
+++ b/test/test_rgfa_line_destructors.rb
@@ -0,0 +1,93 @@
+require_relative "../lib/rgfa.rb"
+require "test/unit"
+
+class TestRGFALineDestructors < Test::Unit::TestCase
+
+ def test_delete_headers
+ gfa = RGFA.new
+ gfa << "H\tVN:Z:1.0"
+ assert_equal(["H\tVN:Z:1.0"], gfa.headers.map(&:to_s))
+ gfa.delete_headers
+ assert_equal([], gfa.headers)
+ gfa = RGFA.new
+ gfa << "H\tVN:Z:1.0"
+ gfa.rm(:headers)
+ assert_equal([], gfa.headers)
+ end
+
+ def test_delete_links
+ gfa = RGFA.new
+ s = ["S\t0\t*", "S\t1\t*", "S\t2\t*"]
+ l = "L\t1\t+\t2\t+\t12M"
+ c = "C\t1\t+\t0\t+\t12\t12M"
+ (s + [l,c]).each {|line| gfa << line }
+ assert_equal([l], gfa.links.map(&:to_s))
+ assert_equal(l, gfa.link(["1", :E], ["2", :B]).to_s)
+ gfa.delete_link(gfa.link_from_to(["1", "+"], ["2", "+"]))
+ assert_equal([], gfa.links)
+ assert_equal(nil, gfa.link(["1", :E], ["2", :B]))
+ assert_equal([c], gfa.containments.map(&:to_s))
+ assert_equal(c, gfa.containment("1", "0").to_s)
+ gfa << l
+ assert_not_equal([], gfa.links)
+ gfa.rm(gfa.link_from_to(["1","+"],["2","+"]))
+ assert_equal([], gfa.links)
+ end
+
+ def test_delete_containments
+ gfa = RGFA.new
+ s = ["S\t0\t*", "S\t1\t*", "S\t2\t*"]
+ l = "L\t1\t+\t2\t+\t12M"
+ c = "C\t1\t+\t0\t+\t12\t12M"
+ (s + [l,c]).each {|line| gfa << line }
+ gfa.delete_containment(gfa.containment("1", "0"))
+ assert_equal([], gfa.containments)
+ assert_equal(nil, gfa.containment("1", "0"))
+ gfa << c
+ assert_not_equal([], gfa.containments)
+ assert_equal(c, gfa.containment("1", "0").to_s)
+ gfa.rm(gfa.containment("1", "0"))
+ assert_equal([], gfa.containments)
+ end
+
+ def test_unconnect_segments
+ gfa = RGFA.new
+ s = ["S\t0\t*", "S\t1\t*", "S\t2\t*"]
+ l = "L\t1\t+\t2\t+\t12M"
+ c = "C\t1\t+\t0\t+\t12\t12M"
+ (s + [l,c]).each {|line| gfa << line }
+ gfa.unconnect_segments("0", "1")
+ gfa.unconnect_segments("2", "1")
+ assert_equal([], gfa.containments)
+ assert_equal([], gfa.links)
+ end
+
+ def test_delete_segment
+ gfa = RGFA.new
+ gfa << "H\tVN:Z:1.0"
+ s = ["S\t0\t*", "S\t1\t*", "S\t2\t*"]
+ l = "L\t1\t+\t2\t+\t12M"
+ c = "C\t1\t+\t0\t+\t12\t12M"
+ p = "P\t4\t2+,0-\t12M"
+ (s + [l,c,p]).each {|line| gfa << line }
+ assert_equal(s, gfa.segments.map(&:to_s))
+ assert_equal([:"0", :"1", :"2"], gfa.segment_names)
+ assert_equal([l], gfa.links.select{|n|!n.virtual?}.map(&:to_s))
+ assert_equal([c], gfa.containments.map(&:to_s))
+ assert_equal([p], gfa.paths.map(&:to_s))
+ assert_equal([:"4"], gfa.path_names)
+ gfa.delete_segment("0")
+ assert_equal([s[1],s[2]], gfa.segments.map(&:to_s))
+ assert_equal([:"1", :"2"], gfa.segment_names)
+ assert_equal([l], gfa.links.select{|n|!n.virtual?}.map(&:to_s))
+ assert_equal([], gfa.containments.map(&:to_s))
+ assert_equal([], gfa.paths.map(&:to_s))
+ assert_equal([], gfa.path_names)
+ gfa.delete_segment("1")
+ assert_equal([s[2]], gfa.segments.map(&:to_s))
+ assert_equal([], gfa.links)
+ gfa.rm("2")
+ assert_equal([], gfa.segments)
+ end
+
+end
diff --git a/test/test_rgfa_line_getters.rb b/test/test_rgfa_line_getters.rb
new file mode 100644
index 0000000..47a61b7
--- /dev/null
+++ b/test/test_rgfa_line_getters.rb
@@ -0,0 +1,246 @@
+require_relative "../lib/rgfa.rb"
+require "test/unit"
+
+class TestRGFALineGetters < Test::Unit::TestCase
+
+ def test_headers
+ h = ["H\tVN:Z:1.0"]
+ assert_equal(h, h.to_rgfa.headers.map(&:to_s))
+ end
+
+ def test_each_header
+ h1 = ["H\tVN:Z:1.0"]
+ h2 = []
+ gfa = h1.to_rgfa
+ gfa.headers.each {|h| h2 << h.to_s}
+ assert_equal(h1, h2)
+ end
+
+ def test_segments
+ s = ["S\t1\t*","S\t2\t*"]
+ gfa = s.to_rgfa
+ assert_equal(s, gfa.segments.map(&:to_s))
+ gfa.delete_segment("1")
+ assert_equal([s[1]], gfa.segments.map(&:to_s))
+ end
+
+ def test_each_segment
+ s1 = ["S\t1\t*","S\t2\t*"]
+ s2 = []
+ gfa = s1.to_rgfa
+ gfa.segments.each {|s| s2 << s.to_s}
+ assert_equal(s1, s2)
+ gfa.delete_segment("1")
+ s2 = []
+ gfa.segments.each {|s| s2 << s.to_s}
+ assert_equal([s1[1]], s2)
+ end
+
+ def test_links
+ s = ["S\t1\t*","S\t2\t*", "S\t3\t*"]
+ l = ["L\t1\t+\t2\t+\t12M", "L\t1\t+\t3\t+\t12M"]
+ gfa = (s+l).to_rgfa
+ assert_equal(l, gfa.links.map(&:to_s))
+ gfa.unconnect_segments("1","2")
+ assert_equal([l[1]], gfa.links.map(&:to_s))
+ end
+
+ def test_each_link
+ s = ["S\t1\t*","S\t2\t*", "S\t3\t*"]
+ l1 = ["L\t1\t+\t2\t+\t12M", "L\t1\t+\t3\t+\t12M"]
+ gfa = (s+l1).to_rgfa
+ l2 = []
+ gfa.links.each {|l| l2 << l.to_s}
+ assert_equal(l1, l2)
+ gfa.unconnect_segments("1","2")
+ l2 = []
+ gfa.links.each {|l| l2 << l.to_s}
+ assert_equal([l1[1]],l2)
+ end
+
+ def test_containments
+ s = ["S\t1\t*","S\t2\t*", "S\t3\t*"]
+ c = ["C\t1\t+\t2\t+\t12\t12M", "C\t1\t+\t3\t+\t12\t12M"]
+ gfa = (s+c).to_rgfa
+ assert_equal(c, gfa.containments.map(&:to_s))
+ gfa.unconnect_segments("1","2")
+ assert_equal([c[1]], gfa.containments.map(&:to_s))
+ end
+
+ def test_each_containment
+ s = ["S\t1\t*","S\t2\t*", "S\t3\t*"]
+ c1 = ["C\t1\t+\t2\t+\t12\t12M", "C\t1\t+\t3\t+\t12\t12M"]
+ gfa = (s+c1).to_rgfa
+ c2 = []
+ gfa.containments.each {|c| c2 << c.to_s}
+ assert_equal(c1, c2)
+ gfa.unconnect_segments("1","2")
+ c2 = []
+ gfa.containments.each {|c| c2 << c.to_s}
+ assert_equal([c1[1]], c2)
+ end
+
+ def test_paths
+ s = ["S\t1\t*","S\t2\t*", "S\t3\t*"]
+ l = ["L\t1\t+\t2\t+\t122M", "L\t1\t+\t3\t+\t120M"]
+ pt = ["P\t4\t1+,2+\t122M", "P\t5\t1+,3+\t120M"]
+ gfa = (s+l+pt).to_rgfa
+ assert_equal(pt, gfa.paths.map(&:to_s))
+ gfa.delete_path("4")
+ assert_equal([pt[1]], gfa.paths.map(&:to_s))
+ end
+
+ def test_each_path
+ s = ["S\t1\t*","S\t2\t*", "S\t3\t*"]
+ l = ["L\t1\t+\t2\t+\t122M", "L\t1\t+\t3\t+\t120M"]
+ pt1 = ["P\t4\t1+,2+\t122M", "P\t5\t1+,3+\t120M"]
+ gfa = (s+l+pt1).to_rgfa
+ pt2 = []
+ gfa.paths.each {|pt| pt2 << pt.to_s}
+ assert_equal(pt1, pt2)
+ gfa.delete_path("4")
+ pt2 = []
+ gfa.paths.each {|pt| pt2 << pt.to_s}
+ assert_equal([pt1[1]], pt2)
+ end
+
+ def test_segment
+ s = ["S\t1\t*","S\t2\t*"]
+ gfa = s.to_rgfa
+ assert_equal(s[0],gfa.segment("1").to_s)
+ assert_equal(s[0],gfa.segment!("1").to_s)
+ assert_equal(nil,gfa.segment("0"))
+ assert_raises(RGFA::LineMissingError) {gfa.segment!("0").to_s}
+ end
+
+ def test_path
+ s = ["S\t1\t*","S\t2\t*", "S\t3\t*"]
+ l = ["L\t1\t+\t2\t+\t122M", "L\t1\t+\t3\t+\t120M"]
+ pt = ["P\t4\t1+,2+\t122M", "P\t5\t1+,3+\t120M"]
+ gfa = (s+l+pt).to_rgfa
+ assert_equal(pt[0],gfa.path("4").to_s)
+ assert_equal(pt[0],gfa.path!("4").to_s)
+ assert_equal(nil,gfa.path("6"))
+ assert_raises(RGFA::LineMissingError) {gfa.path!("6").to_s}
+ end
+
+ def test_paths_with_segment
+ gfa = RGFA.new
+ s = (0..3).map{|i| "S\t#{i}\t*".to_rgfa_line}
+ p = "P\t4\t2+,0-\t*"
+ (s + [p]).each {|line| gfa << line }
+ assert_equal([p], gfa.paths_with("0").map(&:to_s))
+ assert_equal([p], gfa.paths_with("2").map(&:to_s))
+ assert_equal([], gfa.paths_with("1").map(&:to_s))
+ end
+
+ def test_containing
+ gfa = RGFA.new
+ (0..2).each{|i| gfa << "S\t#{i}\t*"}
+ c = "C\t1\t+\t0\t+\t0\t*"
+ gfa << c
+ assert_equal([c], gfa.containing("0").map(&:to_s))
+ assert_equal([], gfa.containing("1"))
+ assert_equal([], gfa.containing("2"))
+ end
+
+ def test_contained_in
+ gfa = RGFA.new
+ (0..2).each{|i| gfa << "S\t#{i}\t*"}
+ c = "C\t1\t+\t0\t+\t0\t*"
+ gfa << c
+ assert_equal([], gfa.contained_in("0"))
+ assert_equal([c], gfa.contained_in("1").map(&:to_s))
+ assert_equal([], gfa.contained_in("2"))
+ end
+
+ def test_containments_between
+ gfa = RGFA.new
+ (0..2).each{|i| gfa << "S\t#{i}\t*"}
+ c1 = "C\t1\t+\t0\t+\t0\t*"
+ c2 = "C\t1\t+\t0\t+\t12\t*"
+ gfa << c1
+ gfa << c2
+ assert_equal([], gfa.containments_between("0", "1"))
+ assert_equal([c1,c2], gfa.containments_between("1", "0").map(&:to_s))
+ end
+
+ def test_containment
+ gfa = RGFA.new
+ (0..2).each{|i| gfa << "S\t#{i}\t*"}
+ c1 = "C\t1\t+\t0\t+\t0\t*"
+ c2 = "C\t1\t+\t0\t+\t12\t*"
+ gfa << c1
+ gfa << c2
+ assert_equal(nil, gfa.containment("0", "1"))
+ assert_raises(RGFA::LineMissingError) {gfa.containment!("0", "1")}
+ assert_equal(c1, gfa.containment("1", "0").to_s)
+ assert_equal(c1, gfa.containment!("1", "0").to_s)
+ end
+
+ def test_links_of
+ gfa = RGFA.new
+ (0..3).each{|i| gfa << "S\t#{i}\t*"}
+ l0 = "L\t1\t+\t2\t+\t*"; gfa << l0
+ l1 = "L\t0\t+\t1\t+\t*"; gfa << l1
+ l2 = "L\t1\t+\t3\t+\t*"; gfa << l2
+ assert_equal([], gfa.links_of(["0", :B]).map(&:to_s))
+ assert_equal([l1], gfa.links_of(["0", :E]).map(&:to_s))
+ assert_equal([l1], gfa.links_of(["1", :B]).map(&:to_s))
+ assert_equal([l0,l2], gfa.links_of(["1", :E]).map(&:to_s))
+ assert_equal([l0], gfa.links_of(["2", :B]).map(&:to_s))
+ assert_equal([], gfa.links_of(["2", :E]).map(&:to_s))
+ assert_equal([l2], gfa.links_of(["3", :B]).map(&:to_s))
+ assert_equal([], gfa.links_of(["3", :E]).map(&:to_s))
+ gfa = RGFA.new
+ (0..3).each{|i| gfa << "S\t#{i}\t*"}
+ l0 = "L\t1\t+\t2\t-\t*"; gfa << l0
+ l1 = "L\t0\t+\t1\t-\t*"; gfa << l1
+ l2 = "L\t1\t-\t3\t+\t*"; gfa << l2
+ assert_equal([], gfa.links_of(["0", :B]).map(&:to_s))
+ assert_equal([l1], gfa.links_of(["0", :E]).map(&:to_s))
+ assert_equal([l2], gfa.links_of(["1", :B]).map(&:to_s))
+ assert_equal([l0,l1], gfa.links_of(["1", :E]).map(&:to_s))
+ assert_equal([], gfa.links_of(["2", :B]).map(&:to_s))
+ assert_equal([l0], gfa.links_of(["2", :E]).map(&:to_s))
+ assert_equal([l2], gfa.links_of(["3", :B]).map(&:to_s))
+ assert_equal([], gfa.links_of(["3", :E]).map(&:to_s))
+ end
+
+ def test_links_between
+ gfa = RGFA.new
+ (0..3).each{|i| gfa << "S\t#{i}\t*"}
+ l0 = "L\t1\t+\t2\t+\t11M1D3M"; gfa << l0
+ l1 = "L\t1\t+\t2\t+\t10M2D3M"; gfa << l1
+ l2 = "L\t1\t+\t3\t+\t*"; gfa << l2
+ assert_equal([l0, l1], gfa.links_between(["1", :E], ["2", :B]).map(&:to_s))
+ assert_equal([], gfa.links_between(["1", :E], ["2", :E]).map(&:to_s))
+ end
+
+ def test_link
+ gfa = RGFA.new
+ (0..3).each{|i| gfa << "S\t#{i}\t*"}
+ l0 = "L\t1\t+\t2\t+\t11M1D3M"; gfa << l0
+ l1 = "L\t1\t+\t2\t+\t10M2D3M"; gfa << l1
+ l2 = "L\t1\t+\t3\t+\t*"; gfa << l2
+ assert_equal(l0, gfa.link(["1", :E], ["2", :B]).to_s)
+ assert_equal(l0, gfa.link!(["1", :E], ["2", :B]).to_s)
+ assert_equal(nil, gfa.link(["1", :E], ["2", :E]))
+ assert_raise(RGFA::LineMissingError) { gfa.link!(["1", :E], ["2", :E]) }
+ end
+
+ def test_header_tags
+ gfa = RGFA.new
+ gfa << "H\tVN:Z:1.0"
+ gfa << "H\taa:i:12\tab:Z:test1"
+ gfa << "H\taa:i:15"
+ gfa << "H\tac:Z:test2"
+ assert_equal([[:VN, :Z, "1.0"],
+ [:aa, :i, 12],
+ [:aa, :i, 15],
+ [:ab, :Z, "test1"],
+ [:ac, :Z, "test2"]].sort,
+ gfa.header.tags.sort)
+ end
+
+end
diff --git a/test/test_rgfa_line_header.rb b/test/test_rgfa_line_header.rb
new file mode 100644
index 0000000..25b3d2b
--- /dev/null
+++ b/test/test_rgfa_line_header.rb
@@ -0,0 +1,17 @@
+require_relative "../lib/rgfa.rb"
+require "test/unit"
+
+class TestRGFALineHeader < Test::Unit::TestCase # parsing of H (header) lines from strings
+
+ def test_from_string # a valid header parses to RGFA::Line::Header; malformed fields raise
+ assert_nothing_raised { "H\tVN:Z:1.0".to_rgfa_line }
+ assert_equal(RGFA::Line::Header, "H\tVN:Z:1.0".to_rgfa_line.class)
+ assert_raises(RGFA::FieldParser::FormatError) do # "H2" is not in tag:type:value form
+ "H\tH2\tVN:Z:1.0".to_rgfa_line
+ end
+ assert_raises(RGFA::Line::PredefinedOptfieldTypeError) do # VN given with datatype i; the accepted form above is VN:Z
+ "H\tVN:i:1.0".to_rgfa_line
+ end
+ end
+
+end
diff --git a/test/test_rgfa_line_link.rb b/test/test_rgfa_line_link.rb
new file mode 100644
index 0000000..702cc8f
--- /dev/null
+++ b/test/test_rgfa_line_link.rb
@@ -0,0 +1,43 @@
+require_relative "../lib/rgfa.rb"
+require "test/unit"
+
+class TestRGFALineLink < Test::Unit::TestCase
+
+ def test_from_string
+ fields=["L","1","+","2","-","12M","RC:i:1232","NM:i:3","ab:Z:abcd",
+ "FC:i:2321","KC:i:1212","MQ:i:40"]
+ str=fields.join("\t")
+ assert_nothing_raised { str.to_rgfa_line }
+ assert_equal(RGFA::Line::Link, str.to_rgfa_line.class)
+ assert_equal(fields[0].to_sym, str.to_rgfa_line.record_type)
+ assert_equal(fields[1].to_sym, str.to_rgfa_line.from)
+ assert_equal(fields[2].to_sym, str.to_rgfa_line.from_orient)
+ assert_equal(fields[3].to_sym, str.to_rgfa_line.to)
+ assert_equal(fields[4].to_sym, str.to_rgfa_line.to_orient)
+ assert_equal([RGFA::CIGAR::Operation.new(12,:M)], str.to_rgfa_line.overlap)
+ assert_equal(1232, str.to_rgfa_line.RC)
+ assert_equal(3, str.to_rgfa_line.NM)
+ assert_equal(2321, str.to_rgfa_line.FC)
+ assert_equal(1212, str.to_rgfa_line.KC)
+ assert_equal(40, str.to_rgfa_line.MQ)
+ assert_equal("abcd", str.to_rgfa_line.ab)
+ assert_raises(RGFA::FieldParser::FormatError) { (str+"\tH1").to_rgfa_line }
+ assert_raises(RGFA::Line::RequiredFieldMissingError) { "L\tH".to_rgfa_line }
+ assert_raises(RGFA::FieldParser::FormatError) do
+ f=fields.dup; f[2]="x"; f.join("\t").to_rgfa_line(validate: 3)
+ end
+ assert_raises(RGFA::FieldParser::FormatError) do
+ f=fields.dup; f[4]="x"; f.join("\t").to_rgfa_line(validate: 3)
+ end
+ assert_raises(RGFA::CIGAR::ValueError) do
+ f=fields.dup; f[5]="x"; f.join("\t").to_rgfa_line(validate: 3)
+ end
+ assert_raises(RGFA::Line::PredefinedOptfieldTypeError) do
+ f=fields.dup; f[6]="RC:Z:1232"; f.join("\t").to_rgfa_line(validate: 3)
+ end
+ assert_raises(RGFA::Line::PredefinedOptfieldTypeError) do
+ f=fields.dup; f[7]="NM:Z:1232"; f.join("\t").to_rgfa_line(validate: 3)
+ end
+ end
+
+end
diff --git a/test/test_rgfa_line_path.rb b/test/test_rgfa_line_path.rb
new file mode 100644
index 0000000..beb8ee9
--- /dev/null
+++ b/test/test_rgfa_line_path.rb
@@ -0,0 +1,48 @@
+require_relative "../lib/rgfa.rb"
+require "test/unit"
+
+class TestRGFALinePath < Test::Unit::TestCase
+
+ def test_from_string
+ fields=["P","4","1+,2-,3+","9M2I3D1M,12M","ab:Z:abcd"]
+ str=fields.join("\t")
+ assert_nothing_raised { str.to_rgfa_line }
+ assert_equal(RGFA::Line::Path, str.to_rgfa_line.class)
+ assert_equal(fields[0].to_sym, str.to_rgfa_line.record_type)
+ assert_equal(fields[1].to_sym, str.to_rgfa_line.path_name)
+ assert_equal([[:"1",:"+"],[:"2",:"-"],[:"3",:"+"]],
+ str.to_rgfa_line.segment_names)
+ assert_equal([[RGFA::CIGAR::Operation.new(9,:M),
+ RGFA::CIGAR::Operation.new(2,:I),
+ RGFA::CIGAR::Operation.new(3,:D),
+ RGFA::CIGAR::Operation.new(1,:M)],
+ [RGFA::CIGAR::Operation.new(12,:M)]],
+ str.to_rgfa_line.cigars)
+ assert_equal("abcd", str.to_rgfa_line.ab)
+ assert_raises(RGFA::FieldParser::FormatError) { (str+"\tH1").to_rgfa_line }
+ assert_raises(RGFA::Line::RequiredFieldMissingError) { "P\tH".to_rgfa_line }
+ assert_raises(RGFA::FieldParser::FormatError) do
+ f=fields.dup; f[2]="1,2,3"; f.join("\t").to_rgfa_line(validate: 3)
+ end
+ assert_raises(RGFA::Line::Path::ListLengthsError) do
+ f=fields.dup; f[2]="1+"; f.join("\t").to_rgfa_line(validate: 3)
+ end
+ assert_nothing_raised do
+ f=fields.dup; f[3]="*,*"; f.join("\t").to_rgfa_line(validate: 3)
+ end
+ assert_nothing_raised do
+ f=fields.dup; f[3]="9M2I3D1M,12M,12M"; f.join("\t").
+ to_rgfa_line(validate: 3)
+ end
+ assert_nothing_raised do
+ f=fields.dup; f[3]="*"; f.join("\t").to_rgfa_line(validate: 3)
+ end
+ assert_raises(RGFA::CIGAR::ValueError) do
+ f=fields.dup; f[3]="12,12"; f.join("\t").to_rgfa_line(validate: 3)
+ end
+ assert_raises(RGFA::CIGAR::ValueError) do
+ f=fields.dup; f[3]="12M|12M"; f.join("\t").to_rgfa_line(validate: 3)
+ end
+ end
+
+end
diff --git a/test/test_rgfa_line_segment.rb b/test/test_rgfa_line_segment.rb
new file mode 100644
index 0000000..1c21888
--- /dev/null
+++ b/test/test_rgfa_line_segment.rb
@@ -0,0 +1,64 @@
+require_relative "../lib/rgfa.rb"
+require "test/unit"
+
+class TestRGFALineSegment < Test::Unit::TestCase
+
+ def test_from_string
+ fields=["S","1","ACGTCACANNN","RC:i:1232","LN:i:11","ab:Z:abcd",
+ "FC:i:2321","KC:i:1212"]
+ str=fields.join("\t")
+ assert_nothing_raised { str.to_rgfa_line }
+ assert_equal(RGFA::Line::Segment, str.to_rgfa_line.class)
+ assert_equal(fields[0].to_sym, str.to_rgfa_line.record_type)
+ assert_equal(fields[1].to_sym, str.to_rgfa_line.name)
+ assert_equal(fields[2], str.to_rgfa_line.sequence)
+ assert_equal(1232, str.to_rgfa_line.RC)
+ assert_equal(11, str.to_rgfa_line.LN)
+ assert_equal(2321, str.to_rgfa_line.FC)
+ assert_equal(1212, str.to_rgfa_line.KC)
+ assert_equal("abcd", str.to_rgfa_line.ab)
+ assert_raises(RGFA::FieldParser::FormatError) { (str+"\tH1").to_rgfa_line }
+ assert_raises(RGFA::Line::RequiredFieldMissingError) { "S\tH".to_rgfa_line }
+ assert_raises(RGFA::FieldParser::FormatError) do
+ f=fields.dup; f[2]="!@#?"; f.join("\t").to_rgfa_line(validate: 3)
+ end
+ assert_raises(RGFA::Line::PredefinedOptfieldTypeError) do
+ f=fields.dup; f[3]="RC:Z:1232"; f.join("\t").to_rgfa_line
+ end
+ f=["S","2","ACGTCACANNN","LN:i:3"]
+ assert_raises(RGFA::Line::Segment::InconsistentLengthError) do
+ f.join("\t").to_rgfa_line(validate: 3)
+ end
+ f=["S","2","ACGTCACANNN","LN:i:11"]
+ assert_nothing_raised { f.join("\t").to_rgfa_line }
+ f=["S","2","*","LN:i:3"]
+ assert_nothing_raised { f.join("\t").to_rgfa_line }
+ end
+
+ def test_coverage
+ l = "S\t0\t*\tRC:i:600\tLN:i:100".to_rgfa_line
+ assert_equal(6, l.coverage)
+ assert_equal(6, l.coverage!)
+ l = "S\t0\t*\tRC:i:600".to_rgfa_line
+ assert_equal(nil, l.coverage)
+ assert_raises(RGFA::Line::Segment::UndefinedLengthError) {l.coverage!}
+ l = "S\t0\t*\tLN:i:100".to_rgfa_line
+ assert_equal(nil, l.coverage)
+ assert_raises(RGFA::Line::TagMissingError) {l.coverage!}
+ l = "S\t0\t*\tFC:i:600\tLN:i:100".to_rgfa_line
+ assert_equal(nil, l.coverage)
+ assert_raises(RGFA::Line::TagMissingError) {l.coverage!}
+ assert_equal(6, l.coverage(count_tag: :FC))
+ assert_equal(6, l.coverage!(count_tag: :FC))
+ end
+
+ def test_other_orientation
+ assert_equal(:+, RGFA::OrientedSegment.invert("-"))
+ assert_equal(:-, RGFA::OrientedSegment.invert("+"))
+ assert_equal(:-, RGFA::OrientedSegment.invert(:+))
+ assert_raises(RGFA::SegmentInfo::InvalidAttributeError) do
+ RGFA::OrientedSegment.invert("x")
+ end
+ end
+
+end
diff --git a/test/test_rgfa_segment_references.rb b/test/test_rgfa_segment_references.rb
new file mode 100644
index 0000000..2503d51
--- /dev/null
+++ b/test/test_rgfa_segment_references.rb
@@ -0,0 +1,20 @@
+require_relative "../lib/rgfa.rb"
+require "test/unit"
+
+class TestRGFASegmentReferences < Test::Unit::TestCase # helpers on a Link that refer to its two segments
+
+ def test_link_other # other(name) returns the segment at the opposite side of the link
+ l = "L\t1\t+\t2\t-\t*".to_rgfa_line
+ assert_equal(:"2", l.other(:"1"))
+ assert_equal(:"1", l.other(:"2"))
+ assert_raise(RGFA::LineMissingError){l.other(:"0")} # segment 0 is neither end of this link
+ end
+
+ def test_link_circular # circular? is true when from and to name the same segment
+ l = "L\t1\t+\t2\t-\t*".to_rgfa_line
+ assert_equal(false, l.circular?)
+ l = "L\t1\t+\t1\t-\t*".to_rgfa_line
+ assert_equal(true, l.circular?)
+ end
+
+end
diff --git a/test/test_rgfa_sequence.rb b/test/test_rgfa_sequence.rb
new file mode 100644
index 0000000..91d8bd7
--- /dev/null
+++ b/test/test_rgfa_sequence.rb
@@ -0,0 +1,19 @@
+require_relative "../lib/rgfa.rb"
+require "test/unit"
+
+class TestRGFASequence < Test::Unit::TestCase # String#rc (reverse complement) as provided by the library
+
+ def test_rc # expected outputs are the reversed, complemented inputs
+ assert_equal("gcatcgatcgt","acgatcgatgc".rc)
+ assert_equal("gCaTCgatcgt","acgatcGAtGc".rc) # letter case is kept per position
+ assert_equal("gcatcnatcgt","acgatngatgc".rc) # n stays n
+ assert_equal("gcatcYatcgt","acgatRgatgc".rc) # ambiguity code R becomes Y
+ assert_raises(RuntimeError){"acgatUgatgc".rc} # input contains both t and U
+ assert_equal("gcaucgaucgu","acgaucgaugc".rc) # u-only input yields u in the output
+ assert_equal("===.",".===".rc) # "." and "=" are only reversed
+ assert_raises(RuntimeError){"acgatXgatgc".rc} # X is not accepted
+ assert_equal("*","*".rc) # a lone "*" is returned unchanged
+ assert_raises(RuntimeError){"**".rc} # more than one "*" is rejected
+ end
+
+end
diff --git a/test/test_rgfa_traverse.rb b/test/test_rgfa_traverse.rb
new file mode 100644
index 0000000..6976cd9
--- /dev/null
+++ b/test/test_rgfa_traverse.rb
@@ -0,0 +1,96 @@
+require_relative "../lib/rgfa.rb"
+require "test/unit"
+
+class TestRGFATraverse < Test::Unit::TestCase # linear path detection and merging on small graphs
+
+ def test_linear_path_merging # merge_linear_path on an explicit list of segment ends
+ s = ["S\t0\tACGA",
+ "S\t1\tACGA",
+ "S\t2\tACGA",
+ "S\t3\tACGA"]
+ l = ["L\t0\t+\t1\t+\t1M",
+ "L\t1\t+\t2\t-\t1M",
+ "L\t2\t+\t3\t+\t1M"]
+ gfa = RGFA.new
+ gfa << "H\tVN:Z:1.0"
+ (s + l).each {|line| gfa << line }
+ assert_raises(ArgumentError) do # NOTE(review): the last link here is 2+ -> 3+, while the accepted graph below uses 2- -> 3+; presumably the requested path is not linear in this graph
+ gfa.merge_linear_path([["0", :E],["1", :E],["2", :B],["3", :E]])
+ end
+ s = ["S\t0\tACGA",
+ "S\t1\tACGA",
+ "S\t2\tACGT",
+ "S\t3\tTCGA"]
+ l = ["L\t0\t+\t1\t+\t1M",
+ "L\t1\t+\t2\t-\t1M",
+ "L\t2\t-\t3\t+\t1M"]
+ gfa = RGFA.new
+ gfa << "H\tVN:Z:1.0"
+ (s + l).each {|line| gfa << line }
+ assert_nothing_raised do
+ gfa.merge_linear_path([["0", :E],["1", :E],["2", :B],["3", :E]])
+ end
+ assert_raises(RGFA::LineMissingError) {gfa.segment!("0")} # the four original segments are gone after the merge
+ assert_raises(RGFA::LineMissingError) {gfa.segment!("1")}
+ assert_raises(RGFA::LineMissingError) {gfa.segment!("2")}
+ assert_raises(RGFA::LineMissingError) {gfa.segment!("3")}
+ assert_nothing_raised {gfa.segment!("0_1_2_3")} # merged segment is named by joining the original names with "_"
+ assert_equal([], gfa.links)
+ assert_equal("ACGACGACGTCGA", gfa.segment("0_1_2_3").sequence) # 13 bases: four 4-base segments joined over three 1-base overlaps
+ end
+
+ def test_linear_path_merge_all # merge_linear_paths on a chain and on two branching graphs
+ s = ["S\t0\t*",
+ "S\t1\t*",
+ "S\t2\t*",
+ "S\t3\t*"]
+ l = ["L\t0\t+\t1\t+\t1M",
+ "L\t1\t+\t2\t-\t1M",
+ "L\t2\t-\t3\t+\t1M"]
+ gfa = RGFA.new
+ gfa << "H\tVN:Z:1.0"
+ (s + l).each {|line| gfa << line }
+ gfa.merge_linear_paths # NOTE(review): called here and again inside the assertion on the next line, so the asserted call runs on an already-merged graph; confirm this is intended
+ assert_nothing_raised { gfa.merge_linear_paths }
+ assert_equal([:"0_1_2_3"], gfa.segment_names)
+ assert_equal(1, gfa.segments.size)
+ assert_equal([], gfa.links)
+ s = ["S\t0\t*",
+ "S\t1\t*",
+ "S\t2\t*",
+ "S\t3\t*"]
+ l = ["L\t0\t+\t1\t+\t1M",
+ "L\t0\t+\t2\t+\t1M",
+ "L\t1\t+\t2\t-\t1M",
+ "L\t2\t-\t3\t+\t1M"].map(&:to_rgfa_line)
+ gfa = RGFA.new
+ gfa << "H\tVN:Z:1.0"
+ (s + l).each {|line| gfa << line }
+ assert_nothing_raised { gfa.merge_linear_paths }
+ assert_equal(3, gfa.segments.size)
+ assert_equal([:"0",:"3",:"1_2"], gfa.segments.map(&:name)) # the merged segment is listed after the untouched ones
+ s = ["S\t0\t*",
+ "S\t1\t*",
+ "S\t2\t*",
+ "S\t3\t*"]
+ l = ["L\t0\t+\t1\t+\t1M",
+ "L\t0\t+\t2\t+\t1M",
+ "L\t1\t+\t2\t+\t1M",
+ "L\t2\t+\t3\t+\t1M"].map(&:to_rgfa_line)
+ gfa = RGFA.new
+ gfa << "H\tVN:Z:1.0"
+ (s + l).each {|line| gfa << line }
+ assert_nothing_raised { gfa.merge_linear_paths }
+ assert_equal(3, gfa.segments.size)
+ assert_equal([:"0", :"1", :"2_3"], gfa.segments.map(&:name))
+ end
+
+ def test_linear_path_merge_example1 # linear_paths on the example1.gfa fixture; the path is relative to the directory the tests are run from
+ gfa = RGFA.from_file("test/testdata/example1.gfa")
+ assert_equal([%w[18 19 1],
+ %w[11 9 12],
+ %w[22 16 20 21 23]],
+ gfa.linear_paths.map{|sp|sp.map{|sn,et|sn.to_sym.to_s}}) # only the segment name of each pair is compared; et is unused
+ end
+
+end
diff --git a/test/test_rgfatools.rb b/test/test_rgfatools.rb
new file mode 100644
index 0000000..aef0047
--- /dev/null
+++ b/test/test_rgfatools.rb
@@ -0,0 +1,11 @@
+require_relative "../lib/rgfatools.rb"
+require "test/unit"
+
+class TestRGFATools < Test::Unit::TestCase
+
+ def test_basics
+ assert_nothing_raised { RGFA.new }
+ assert_nothing_raised { RGFA.included_modules.include?(RGFATools) }
+ end
+
+end
diff --git a/test/test_rgfatools_artifacts.rb b/test/test_rgfatools_artifacts.rb
new file mode 100644
index 0000000..8460018
--- /dev/null
+++ b/test/test_rgfatools_artifacts.rb
@@ -0,0 +1,34 @@
+require_relative "../lib/rgfatools.rb"
+require "test/unit"
+
+class TestRGFAToolsArtifacts < Test::Unit::TestCase
+
+ def test_remove_small_components
+ g = RGFA.from_file("test/testdata/two_components.gfa")
+ assert_equal(2, g.connected_components.size)
+ g.remove_small_components(1000)
+ assert_equal(2, g.connected_components.size)
+ g.remove_small_components(3000)
+ assert_equal(1, g.connected_components.size)
+ g.remove_small_components(10000)
+ assert_equal(0, g.connected_components.size)
+ end
+
+ def test_remove_dead_ends
+ g = RGFA.from_file("test/testdata/dead_ends.gfa")
+ assert_equal(6, g.segments.size)
+ g.remove_dead_ends(100)
+ assert_equal(6, g.segments.size)
+ g.remove_dead_ends(1500)
+ assert_equal(5, g.segments.size)
+ g.remove_dead_ends(1500)
+ assert_equal(5, g.segments.size)
+ g.remove_dead_ends(150000)
+ assert_equal(3, g.segments.size)
+ g.remove_dead_ends(150000)
+ assert_equal(2, g.segments.size)
+ g.remove_dead_ends(1500000)
+ assert_equal(0, g.segments.size)
+ end
+
+end
diff --git a/test/test_rgfatools_copy_number.rb b/test/test_rgfatools_copy_number.rb
new file mode 100644
index 0000000..a88c61a
--- /dev/null
+++ b/test/test_rgfatools_copy_number.rb
@@ -0,0 +1,44 @@
+require_relative "../lib/rgfatools.rb"
+require "test/unit"
+
+class TestRGFAToolsCopyNumber < Test::Unit::TestCase
+
+ def test_delete_low_coverage_segments
+ gfa = ["S\t0\t*\tRC:i:600\tLN:i:100",
+ "S\t1\t*\tRC:i:6000\tLN:i:100",
+ "S\t2\t*\tRC:i:60000\tLN:i:100"].to_rgfa
+ assert_equal([:"0",:"1",:"2"], gfa.segment_names)
+ gfa.delete_low_coverage_segments(10)
+ assert_equal([:"1",:"2"], gfa.segment_names)
+ gfa.delete_low_coverage_segments(100)
+ assert_equal([:"2"], gfa.segment_names)
+ gfa.delete_low_coverage_segments(1000)
+ assert_equal([], gfa.segment_names)
+ end
+
+ def test_compute_copy_numbers
+ gfa = ["S\t0\t*\tRC:i:10\tLN:i:100",
+ "S\t1\t*\tRC:i:1000\tLN:i:100",
+ "S\t2\t*\tRC:i:2000\tLN:i:100",
+ "S\t3\t*\tRC:i:3000\tLN:i:100"].to_rgfa
+ assert_nothing_raised { gfa.compute_copy_numbers(9) }
+ assert_equal(0, gfa.segment!("0").cn)
+ assert_equal(1, gfa.segment!("1").cn)
+ assert_equal(2, gfa.segment!("2").cn)
+ assert_equal(3, gfa.segment!("3").cn)
+ end
+
+ def test_apply_copy_number
+ gfa = ["S\t0\t*\tRC:i:10\tLN:i:100",
+ "S\t1\t*\tRC:i:1000\tLN:i:100",
+ "S\t2\t*\tRC:i:2000\tLN:i:100",
+ "S\t3\t*\tRC:i:3000\tLN:i:100"].to_rgfa
+ assert_equal([:"0",:"1",:"2",:"3"], gfa.segment_names)
+ gfa.compute_copy_numbers(9)
+ gfa.apply_copy_numbers
+ assert_equal([:"1",:"2",:"3",:"2b",:"3b",:"3c"], gfa.segment_names)
+ gfa.compute_copy_numbers(9)
+ assert(gfa.segments.map(&:cn).all?{|cn|cn == 1})
+ end
+
+end
diff --git a/test/test_rgfatools_linear_paths.rb b/test/test_rgfatools_linear_paths.rb
new file mode 100644
index 0000000..2bb9f0f
--- /dev/null
+++ b/test/test_rgfatools_linear_paths.rb
@@ -0,0 +1,52 @@
+require_relative "../lib/rgfatools.rb"
+require "test/unit"
+
+class TestRGFAToolsLinearPaths < Test::Unit::TestCase
+
+ def test_linear_path_merging
+ s = ["S\t0\tACGA",
+ "S\t1\tACGA",
+ "S\t2\tACGA",
+ "S\t3\tACGA"]
+ l = ["L\t0\t+\t1\t+\t1M",
+ "L\t1\t+\t2\t-\t1M",
+ "L\t2\t-\t3\t+\t1M"]
+ gfa = RGFA.new
+ (s + l).each {|line| gfa << line }
+ gfa.merge_linear_path([["0", :E],["1", :E],["2", :B],["3", :E]],
+ enable_tracking: true)
+ assert_nothing_raised {gfa.segment!("0_1_2^_3")}
+ assert_equal("ACGACGACGTCGA", gfa.segment("0_1_2^_3").sequence)
+ gfa = RGFA.new
+ gfa.enable_extensions
+ (s + l).each {|line| gfa << line }
+ gfa.merge_linear_path([["0", :E],["1", :E],["2", :B],["3", :E]])
+ assert_nothing_raised {gfa.segment!("0_1_2^_3")}
+ assert_equal("ACGACGACGTCGA", gfa.segment("0_1_2^_3").sequence)
+ end
+
+ def test_linear_path_merge_all
+ s = ["S\t0\t*",
+ "S\t1\t*",
+ "S\t2\t*",
+ "S\t3\t*"]
+ l = ["L\t0\t+\t1\t+\t1M",
+ "L\t1\t+\t2\t-\t1M",
+ "L\t2\t-\t3\t+\t1M"]
+ gfa = RGFA.new
+ gfa.enable_extensions
+ (s + l).each {|line| gfa << line }
+ gfa.merge_linear_paths
+ assert_equal([:"0_1_2^_3"], gfa.segment_names)
+ l = ["L\t0\t+\t1\t+\t1M",
+ "L\t0\t+\t2\t+\t1M",
+ "L\t1\t+\t2\t-\t1M",
+ "L\t2\t-\t3\t+\t1M"].map(&:to_rgfa_line)
+ gfa = RGFA.new
+ gfa.enable_extensions
+ (s + l).each {|line| gfa << line }
+ gfa.merge_linear_paths
+ assert_equal([:"0",:"3",:"1_2^"], gfa.segments.map(&:name))
+ end
+
+end
diff --git a/test/test_rgfatools_multiplication.rb b/test/test_rgfatools_multiplication.rb
new file mode 100644
index 0000000..1f318ab
--- /dev/null
+++ b/test/test_rgfatools_multiplication.rb
@@ -0,0 +1,183 @@
+require_relative "../lib/rgfatools.rb"
+require "test/unit"
+
+class TestRGFAToolsMuliplication < Test::Unit::TestCase
+
+ def test_links_distribution_l1_m2
+ g1 = RGFA.from_file("test/testdata/links_distri.l1.gfa")
+ g2 = RGFA.from_file("test/testdata/links_distri.l1.m2.gfa")
+ assert_not_equal(g2.segment_names.sort,g1.segment_names.sort)
+ assert_not_equal(g2.links.map(&:to_s).sort, g1.links.map(&:to_s).sort)
+ g1.multiply_extended(:"1", 2)
+ assert_equal(g2.segment_names.sort,g1.segment_names.sort)
+ assert_equal(g2.links.map(&:to_s).sort, g1.links.map(&:to_s).sort)
+ end
+
+ def test_enable_extensions
+ g1 = RGFA.from_file("test/testdata/links_distri.l1.gfa")
+ g2 = RGFA.from_file("test/testdata/links_distri.l1.m2.gfa")
+ g1.enable_extensions
+ g2.enable_extensions
+ assert_not_equal(g2.segment_names.sort,g1.segment_names.sort)
+ assert_not_equal(g2.links.map(&:to_s).sort, g1.links.map(&:to_s).sort)
+ g1.multiply(:"1", 2)
+ assert_equal(g2.segment_names.sort,g1.segment_names.sort)
+ assert_equal(g2.links.map(&:to_s).sort, g1.links.map(&:to_s).sort)
+ end
+
+ def test_links_distribution_l2_m2
+ g1 = RGFA.from_file("test/testdata/links_distri.l2.gfa")
+ g2 = RGFA.from_file("test/testdata/links_distri.l2.m2.gfa")
+ assert_not_equal(g2.segment_names.sort,g1.segment_names.sort)
+ assert_not_equal(g2.links.map(&:to_s).sort, g1.links.map(&:to_s).sort)
+ g1.multiply_extended(:"1", 2)
+ assert_equal(g2.segment_names.sort,g1.segment_names.sort)
+ assert_equal(g2.links.map(&:to_s).sort, g1.links.map(&:to_s).sort)
+ end
+
+ def test_no_links_distribution_l2_m2
+ g1 = RGFA.from_file("test/testdata/links_distri.l2.gfa")
+ g2 = RGFA.from_file("test/testdata/links_distri.l2.m2.no_ld.gfa")
+ assert_not_equal(g2.segment_names.sort,g1.segment_names.sort)
+ assert_not_equal(g2.links.map(&:to_s).sort, g1.links.map(&:to_s).sort)
+ g1.multiply_extended(:"1", 2, distribute: :off)
+ assert_equal(g2.segment_names.sort,g1.segment_names.sort)
+ assert_equal(g2.links.map(&:to_s).sort, g1.links.map(&:to_s).sort)
+ end
+
+ def test_links_distribution_l2_m3
+ g1 = RGFA.from_file("test/testdata/links_distri.l2.gfa")
+ g2 = RGFA.from_file("test/testdata/links_distri.l2.m3.gfa")
+ assert_not_equal(g2.segment_names.sort,g1.segment_names.sort)
+ assert_not_equal(g2.links.map(&:to_s).sort, g1.links.map(&:to_s).sort)
+ g1.multiply_extended(:"1", 3)
+ assert_equal(g2.segment_names.sort,g1.segment_names.sort)
+ assert_equal(g2.links.map(&:to_s).sort, g1.links.map(&:to_s).sort)
+ end
+
+ def test_no_links_distribution_l2_m3
+ g1 = RGFA.from_file("test/testdata/links_distri.l2.gfa")
+ g2 = RGFA.from_file("test/testdata/links_distri.l2.m3.no_ld.gfa")
+ assert_not_equal(g2.segment_names.sort,g1.segment_names.sort)
+ assert_not_equal(g2.links.map(&:to_s).sort, g1.links.map(&:to_s).sort)
+ g1.multiply_extended(:"1", 3, distribute: :off)
+ assert_equal(g2.segment_names.sort,g1.segment_names.sort)
+ assert_equal(g2.links.map(&:to_s).sort, g1.links.map(&:to_s).sort)
+ end
+
+ def test_links_distribution_l3_m2
+ g1 = RGFA.from_file("test/testdata/links_distri.l3.gfa")
+ g2 = RGFA.from_file("test/testdata/links_distri.l3.m2.gfa")
+ assert_not_equal(g2.segment_names.sort,g1.segment_names.sort)
+ assert_not_equal(g2.links.map(&:to_s).sort, g1.links.map(&:to_s).sort)
+ g1.multiply_extended(:"1", 2)
+ assert_equal(g2.segment_names.sort,g1.segment_names.sort)
+ assert_equal(g2.links.map(&:to_s).sort, g1.links.map(&:to_s).sort)
+ end
+
+ def test_no_links_distribution_l3_m2
+ g1 = RGFA.from_file("test/testdata/links_distri.l3.gfa")
+ g2 = RGFA.from_file("test/testdata/links_distri.l3.m2.no_ld.gfa")
+ assert_not_equal(g2.segment_names.sort,g1.segment_names.sort)
+ assert_not_equal(g2.links.map(&:to_s).sort, g1.links.map(&:to_s).sort)
+ g1.multiply_extended(:"1", 2, distribute: :off)
+ assert_equal(g2.segment_names.sort,g1.segment_names.sort)
+ assert_equal(g2.links.map(&:to_s).sort, g1.links.map(&:to_s).sort)
+ end
+
+ def test_muliply_without_rgfatools
+ g1 = RGFA.from_file("test/testdata/links_distri.l3.gfa")
+ g2 = RGFA.from_file("test/testdata/links_distri.l3.m2.no_ld.gfa")
+ assert_not_equal(g2.segment_names.sort,g1.segment_names.sort)
+ assert_not_equal(g2.links.map(&:to_s).sort, g1.links.map(&:to_s).sort)
+ g1.multiply(:"1", 2)
+ assert_equal(g2.segment_names.sort,g1.segment_names.sort)
+ assert_equal(g2.links.map(&:to_s).sort, g1.links.map(&:to_s).sort)
+ end
+
+ def test_distribution_policy_equal_with_equal
+ g1 = RGFA.from_file("test/testdata/links_distri.l2.gfa")
+ g2 = RGFA.from_file("test/testdata/links_distri.l2.m2.gfa")
+ assert_not_equal(g2.segment_names.sort,g1.segment_names.sort)
+ assert_not_equal(g2.links.map(&:to_s).sort, g1.links.map(&:to_s).sort)
+ g1.multiply_extended(:"1", 2, distribute: :equal)
+ assert_equal(g2.segment_names.sort,g1.segment_names.sort)
+ assert_equal(g2.links.map(&:to_s).sort, g1.links.map(&:to_s).sort)
+ end
+
+ def test_distribution_policy_equal_with_not_equal
+ g1 = RGFA.from_file("test/testdata/links_distri.l3.gfa")
+ g2 = RGFA.from_file("test/testdata/links_distri.l3.m2.no_ld.gfa")
+ assert_not_equal(g2.segment_names.sort,g1.segment_names.sort)
+ assert_not_equal(g2.links.map(&:to_s).sort, g1.links.map(&:to_s).sort)
+ g1.multiply_extended(:"1", 2, distribute: :equal)
+ assert_equal(g2.segment_names.sort,g1.segment_names.sort)
+ assert_equal(g2.links.map(&:to_s).sort, g1.links.map(&:to_s).sort)
+ end
+
+ def test_distribution_policy_B
+ g1 = RGFA.from_file("test/testdata/links_distri.l2.gfa")
+ g2 = RGFA.from_file("test/testdata/links_distri.l2.m2.no_ld.gfa")
+ assert_not_equal(g2.segment_names.sort,g1.segment_names.sort)
+ assert_not_equal(g2.links.map(&:to_s).sort, g1.links.map(&:to_s).sort)
+ g1.multiply_extended(:"1", 2, distribute: :B)
+ assert_equal(g2.segment_names.sort,g1.segment_names.sort)
+ assert_equal(g2.links.map(&:to_s).sort, g1.links.map(&:to_s).sort)
+ end
+
+ def test_distribution_policy_E
+ g1 = RGFA.from_file("test/testdata/links_distri.l2.gfa")
+ g2 = RGFA.from_file("test/testdata/links_distri.l2.m2.gfa")
+ assert_not_equal(g2.segment_names.sort,g1.segment_names.sort)
+ assert_not_equal(g2.links.map(&:to_s).sort, g1.links.map(&:to_s).sort)
+ g1.multiply_extended(:"1", 2, distribute: :E)
+ assert_equal(g2.segment_names.sort,g1.segment_names.sort)
+ assert_equal(g2.links.map(&:to_s).sort, g1.links.map(&:to_s).sort)
+ end
+
+ def test_auto_select_distribute_end_lB_eq_lE
+ g = RGFA.new
+ # lB == lE == 1
+ assert_equal(nil, g.send(:auto_select_distribute_end, 4, 1, 1, false))
+ # lB == lE == factor
+ assert_equal(:E, g.send(:auto_select_distribute_end, 4, 4, 4, false))
+ # lB == lE; </> factor
+ assert_equal(:E, g.send(:auto_select_distribute_end, 4, 2, 2, false))
+ assert_equal(:B, g.send(:auto_select_distribute_end, 4, 6, 6, false))
+ end
+
+ def test_auto_select_distribute_end_l_1
+ g = RGFA.new
+ # lB or lE == 1, other </==/> factor
+ assert_equal(:B, g.send(:auto_select_distribute_end, 4, 2, 1, false))
+ assert_equal(:B, g.send(:auto_select_distribute_end, 4, 4, 1, false))
+ assert_equal(:B, g.send(:auto_select_distribute_end, 4, 6, 1, false))
+ assert_equal(:E, g.send(:auto_select_distribute_end, 4, 1, 2, false))
+ assert_equal(:E, g.send(:auto_select_distribute_end, 4, 1, 4, false))
+ assert_equal(:E, g.send(:auto_select_distribute_end, 4, 1, 6, false))
+ end
+
+ def test_auto_select_distribute_end_eq_factor
+ g = RGFA.new
+ # one =, one > factor
+ assert_equal(:B, g.send(:auto_select_distribute_end, 4, 4, 5, false))
+ assert_equal(:E, g.send(:auto_select_distribute_end, 4, 5, 4, false))
+ # one =, one < factor
+ assert_equal(:B, g.send(:auto_select_distribute_end, 4, 4, 3, false))
+ assert_equal(:E, g.send(:auto_select_distribute_end, 4, 3, 4, false))
+ end
+
+ def test_auto_select_distribute_end_diff_factor
+ g = RGFA.new
+ # both > 1; both < factor
+ assert_equal(:B, g.send(:auto_select_distribute_end, 4, 3, 2, false))
+ assert_equal(:E, g.send(:auto_select_distribute_end, 4, 2, 3, false))
+ # both > 1; both > factor
+ assert_equal(:B, g.send(:auto_select_distribute_end, 4, 5, 6, false))
+ assert_equal(:E, g.send(:auto_select_distribute_end, 4, 6, 5, false))
+ # both > 1; one <, one > factor
+ assert_equal(:B, g.send(:auto_select_distribute_end, 4, 3, 5, false))
+ assert_equal(:E, g.send(:auto_select_distribute_end, 4, 5, 3, false))
+ end
+
+end
diff --git a/test/testdata/dead_ends.gfa b/test/testdata/dead_ends.gfa
new file mode 100644
index 0000000..751cd15
--- /dev/null
+++ b/test/testdata/dead_ends.gfa
@@ -0,0 +1,12 @@
+H VN:Z:1.0
+S 1 * LN:i:1000000
+S 2 * LN:i:1000000
+S 3 * LN:i:100000
+S 3b * LN:i:100000
+S 4 * LN:i:10000
+S 4b * LN:i:1000
+L 1 + 2 + *
+L 2 + 3 + *
+L 2 + 3b + *
+L 3 + 4 + *
+L 3 + 4b + *
diff --git a/test/testdata/example1.gfa b/test/testdata/example1.gfa
new file mode 100644
index 0000000..cfdf6ab
--- /dev/null
+++ b/test/testdata/example1.gfa
@@ -0,0 +1,45 @@
+H VN:Z:1.0
+S 1 * LN:i:6871 RC:i:2200067
+S 10 * LN:i:251 RC:i:82006
+S 11 * LN:i:208 RC:i:39533
+S 12 * LN:i:186 RC:i:34457
+S 16 * LN:i:157 RC:i:15334
+S 18 * LN:i:145 RC:i:55632
+S 19 * LN:i:134 RC:i:49274
+S 2 * LN:i:4589 RC:i:6428225
+S 20 * LN:i:134 RC:i:20521
+S 21 * LN:i:133 RC:i:28174
+S 22 * LN:i:132 RC:i:17846
+S 23 * LN:i:132 RC:i:24658
+S 24 * LN:i:107 RC:i:22256
+S 3 * LN:i:2044 RC:i:2727166
+S 4 * LN:i:1744 RC:i:1729157
+S 5 * LN:i:1378 RC:i:1071246
+S 6 * LN:i:1356 RC:i:422793
+S 7 * LN:i:920 RC:i:630822
+S 8 * LN:i:876 RC:i:794734
+S 9 * LN:i:255 RC:i:40589
+L 1 + 2 + *
+L 1 - 19 - *
+L 10 + 3 - *
+L 10 - 4 + *
+L 11 - 6 - *
+L 11 + 9 - *
+L 12 + 9 + *
+L 12 - 18 + *
+L 16 + 20 + *
+L 16 - 22 - *
+L 18 + 19 + *
+L 18 - 23 + *
+L 2 + 5 + *
+L 2 + 5 - *
+L 2 - 8 + *
+L 20 + 21 + *
+L 21 + 23 - *
+L 22 - 6 - *
+L 24 + 7 + *
+L 24 - 7 + *
+L 3 + 4 - *
+L 3 - 6 + *
+L 3 - 8 - *
+L 4 - 7 - *
diff --git a/test/testdata/example_from_spec.gfa b/test/testdata/example_from_spec.gfa
new file mode 100644
index 0000000..d819013
--- /dev/null
+++ b/test/testdata/example_from_spec.gfa
@@ -0,0 +1,9 @@
+H VN:Z:1.0
+H ul:Z:https://github.com/pmelsted/GFA-spec/blob/master/GFA-spec.md#example
+S 11 ACCTT
+S 12 TCAAGG
+S 13 CTTGATT
+L 11 + 12 - 4M
+L 12 - 13 + 5M
+L 11 + 13 + 3M
+P 14 11+,12-,13+ 4M,5M
diff --git a/test/testdata/example_from_spec.path14.seq b/test/testdata/example_from_spec.path14.seq
new file mode 100644
index 0000000..65069d9
--- /dev/null
+++ b/test/testdata/example_from_spec.path14.seq
@@ -0,0 +1 @@
+ACCTTGATT
diff --git a/test/testdata/example_from_spec2.gfa b/test/testdata/example_from_spec2.gfa
new file mode 100644
index 0000000..e65684e
--- /dev/null
+++ b/test/testdata/example_from_spec2.gfa
@@ -0,0 +1,13 @@
+H VN:Z:1.0
+H ul:Z:https://github.com/pmelsted/GFA-spec/blob/master/GFA-spec.md#first-update-on-gfa
+S 1 CGATGCAA
+L 1 + 2 + 5M
+S 2 TGCAAAGTAC
+L 3 + 2 + 0M
+S 3 TGCAACGTATAGACTTGTCAC RC:i:4
+L 3 + 4 - 1M1D2M1S
+S 4 GCATATA
+L 4 - 5 + 0M
+S 5 CGATGATA
+S 6 ATGA
+C 5 + 6 + 2 4M
diff --git a/test/testdata/links_distri.l1.gfa b/test/testdata/links_distri.l1.gfa
new file mode 100644
index 0000000..b3cf637
--- /dev/null
+++ b/test/testdata/links_distri.l1.gfa
@@ -0,0 +1,4 @@
+H VN:Z:1.0
+S 1 *
+S 2 *
+L 1 + 2 + *
diff --git a/test/testdata/links_distri.l1.m2.gfa b/test/testdata/links_distri.l1.m2.gfa
new file mode 100644
index 0000000..5471921
--- /dev/null
+++ b/test/testdata/links_distri.l1.m2.gfa
@@ -0,0 +1,6 @@
+H VN:Z:1.0
+S 1 *
+S 1b *
+S 2 *
+L 1 + 2 + *
+L 1b + 2 + *
diff --git a/test/testdata/links_distri.l2.gfa b/test/testdata/links_distri.l2.gfa
new file mode 100644
index 0000000..00a9d08
--- /dev/null
+++ b/test/testdata/links_distri.l2.gfa
@@ -0,0 +1,6 @@
+H VN:Z:1.0
+S 1 *
+S 2a *
+S 2b *
+L 1 + 2a + *
+L 1 + 2b + *
diff --git a/test/testdata/links_distri.l2.m2.gfa b/test/testdata/links_distri.l2.m2.gfa
new file mode 100644
index 0000000..81a1d4c
--- /dev/null
+++ b/test/testdata/links_distri.l2.m2.gfa
@@ -0,0 +1,7 @@
+H VN:Z:1.0
+S 1 * or:Z:1
+S 1b * or:Z:1
+S 2a *
+S 2b *
+L 1 + 2a + *
+L 1b + 2b + *
diff --git a/test/testdata/links_distri.l2.m2.no_ld.gfa b/test/testdata/links_distri.l2.m2.no_ld.gfa
new file mode 100644
index 0000000..5174f53
--- /dev/null
+++ b/test/testdata/links_distri.l2.m2.no_ld.gfa
@@ -0,0 +1,9 @@
+H VN:Z:1.0
+S 1 *
+S 2a *
+S 2b *
+S 1b *
+L 1 + 2a + *
+L 1 + 2b + *
+L 1b + 2a + *
+L 1b + 2b + *
diff --git a/test/testdata/links_distri.l2.m3.gfa b/test/testdata/links_distri.l2.m3.gfa
new file mode 100644
index 0000000..346131b
--- /dev/null
+++ b/test/testdata/links_distri.l2.m3.gfa
@@ -0,0 +1,8 @@
+H VN:Z:1.0
+S 1 * or:Z:1
+S 2a *
+S 2b *
+S 1b * or:Z:1
+S 1c * or:Z:1
+L 1 + 2a + *
+L 1b + 2b + *
diff --git a/test/testdata/links_distri.l2.m3.no_ld.gfa b/test/testdata/links_distri.l2.m3.no_ld.gfa
new file mode 100644
index 0000000..fff5b38
--- /dev/null
+++ b/test/testdata/links_distri.l2.m3.no_ld.gfa
@@ -0,0 +1,12 @@
+H VN:Z:1.0
+S 1 *
+S 2a *
+S 2b *
+S 1b *
+S 1c *
+L 1 + 2a + *
+L 1 + 2b + *
+L 1b + 2a + *
+L 1b + 2b + *
+L 1c + 2a + *
+L 1c + 2b + *
diff --git a/test/testdata/links_distri.l3.gfa b/test/testdata/links_distri.l3.gfa
new file mode 100644
index 0000000..6a8ae7a
--- /dev/null
+++ b/test/testdata/links_distri.l3.gfa
@@ -0,0 +1,8 @@
+H VN:Z:1.0
+S 1 *
+S 2a *
+S 2b *
+S 2c *
+L 1 + 2a + *
+L 1 + 2b + *
+L 1 + 2c + *
diff --git a/test/testdata/links_distri.l3.m2.gfa b/test/testdata/links_distri.l3.m2.gfa
new file mode 100644
index 0000000..fa70cc2
--- /dev/null
+++ b/test/testdata/links_distri.l3.m2.gfa
@@ -0,0 +1,10 @@
+H VN:Z:1.0
+S 1 * or:Z:1
+S 2a *
+S 2b *
+S 2c *
+S 1b * or:Z:1
+L 1 + 2a + *
+L 1 + 2b + *
+L 1b + 2b + *
+L 1b + 2c + *
diff --git a/test/testdata/links_distri.l3.m2.no_ld.gfa b/test/testdata/links_distri.l3.m2.no_ld.gfa
new file mode 100644
index 0000000..a8e2774
--- /dev/null
+++ b/test/testdata/links_distri.l3.m2.no_ld.gfa
@@ -0,0 +1,12 @@
+H VN:Z:1.0
+S 1 *
+S 2a *
+S 2b *
+S 2c *
+S 1b *
+L 1 + 2a + *
+L 1 + 2b + *
+L 1 + 2c + *
+L 1b + 2a + *
+L 1b + 2b + *
+L 1b + 2c + *
diff --git a/test/testdata/loop.gfa b/test/testdata/loop.gfa
new file mode 100644
index 0000000..1fa6e88
--- /dev/null
+++ b/test/testdata/loop.gfa
@@ -0,0 +1,10 @@
+H VN:Z:1.0
+H ul:Z:https://github.com/sjackman/assembly-graph/raw/master/loop.gfa
+S 1 AAA
+S 2 ACG
+S 3 CAT
+S 4 TTT
+L 1 + 1 + 2M
+L 2 + 2 - 2M
+L 3 - 3 + 2M
+L 4 - 4 - 2M
diff --git a/test/testdata/sample.gfa b/test/testdata/sample.gfa
new file mode 100644
index 0000000..ff9c477
--- /dev/null
+++ b/test/testdata/sample.gfa
@@ -0,0 +1,12 @@
+H VN:Z:1.0
+H ul:Z:https://github.com/sjackman/assembly-graph/blob/master/sample.gfa
+S 1 CGATGCAA
+S 2 TGCAAAGTAC
+S 3 TGCAACGTATAGACTTGTCAC RC:i:4
+S 4 GCATATA
+S 5 CGATGATA
+S 6 ATGA
+L 1 + 2 + 5M
+L 3 + 2 + 0M
+L 3 + 4 - 1M1D2M1S
+L 4 - 5 + 0M
diff --git a/test/testdata/spec_q1.gfa b/test/testdata/spec_q1.gfa
new file mode 100644
index 0000000..3dfec1b
--- /dev/null
+++ b/test/testdata/spec_q1.gfa
@@ -0,0 +1,8 @@
+H VN:Z:1.0
+S 1 AGCGTA
+S 2 TAACAG
+L 1 + 2 + 0M
+P A 1+,2+ 0M
+P B 1+,2+ 0M st:i:4 en:i:3
+P C 1+ * st:i:5
+P D 2+ * st:i:2 en:i:5
diff --git a/test/testdata/spec_q2.gfa b/test/testdata/spec_q2.gfa
new file mode 100644
index 0000000..48dee55
--- /dev/null
+++ b/test/testdata/spec_q2.gfa
@@ -0,0 +1,9 @@
+H VN:Z:1.0
+H ul:Z:https://github.com/pmelsted/GFA-spec/issues/23
+H co:Z:modified: the final "," of circular was eliminated
+S 1 AGCGTA
+S 2 TAACAG
+L 1 + 2 + 2M
+L 2 + 1 + 2M
+P linear 1+,2+ 2M
+P circular 1+,2+ 2M,2M
diff --git a/test/testdata/spec_q2.path_circular.seq b/test/testdata/spec_q2.path_circular.seq
new file mode 100644
index 0000000..bc4533e
--- /dev/null
+++ b/test/testdata/spec_q2.path_circular.seq
@@ -0,0 +1 @@
+AGCGTAAC
diff --git a/test/testdata/spec_q2.path_linear.seq b/test/testdata/spec_q2.path_linear.seq
new file mode 100644
index 0000000..3bc7a6f
--- /dev/null
+++ b/test/testdata/spec_q2.path_linear.seq
@@ -0,0 +1 @@
+AGCGTAACAG
diff --git a/test/testdata/spec_q3.gfa b/test/testdata/spec_q3.gfa
new file mode 100644
index 0000000..353ae78
--- /dev/null
+++ b/test/testdata/spec_q3.gfa
@@ -0,0 +1,13 @@
+H VN:Z:1.0
+H ul:Z:https://github.com/pmelsted/GFA-spec/issues/23#issuecomment-166806423
+H co:Z:is_this_okay is OK and shall be interpreted as a linear path
+S 1 AGCGTA
+S 2 TAACAG
+S 3 GTCATC
+L 1 + 2 + 2M
+L 2 + 3 + 2M
+L 3 + 1 + 2M
+P linear_path 1+,2+,3+ *,*
+P circular_path 1+,2+,3+ *,*,*
+P more_than_circular 1+,2+,3+,1+ *,*,0M
+P is_this_okay 1+,2+,3+ *
diff --git a/test/testdata/spec_q4.gfa b/test/testdata/spec_q4.gfa
new file mode 100644
index 0000000..7969297
--- /dev/null
+++ b/test/testdata/spec_q4.gfa
@@ -0,0 +1,14 @@
+H VN:Z:1.0
+H ul:Z:https://github.com/pmelsted/GFA-spec/issues/23#issuecomment-172869000
+H co:Z:similar to spec_q3
+H co:Z:more_than_circular should be invalid, 3+/1+/0M is not supported by a link
+S 1 AGCGTA
+S 2 TAACAG
+S 3 GTCATC
+L 1 + 2 + 2M
+L 2 + 3 + 2M
+L 3 + 1 + 2M
+P linear_path 1+,2+,3+ *,*
+P circular_path 1+,2+,3+ *,*,*
+P more_than_circular 1+,2+,3+,1+ *,*,0M
+P is_this_okay 1+,2+,3+ *
diff --git a/test/testdata/spec_q4.path_more_than_circular.seq b/test/testdata/spec_q4.path_more_than_circular.seq
new file mode 100644
index 0000000..c4f6e83
--- /dev/null
+++ b/test/testdata/spec_q4.path_more_than_circular.seq
@@ -0,0 +1 @@
+AGCGTAACAGCATCAGCGTA
diff --git a/test/testdata/spec_q5.gfa b/test/testdata/spec_q5.gfa
new file mode 100644
index 0000000..268a4bf
--- /dev/null
+++ b/test/testdata/spec_q5.gfa
@@ -0,0 +1,11 @@
+H ul:Z:https://github.com/pmelsted/GFA-spec/issues/8
+H co:Z:the last 3 links are equivalent to the first 3
+S read0 * LN:i:5500
+S read1 * LN:i:4000
+S read2 * LN:i:5500
+L read0 + read1 - 2000M
+L read1 - read2 + 3000M
+L read0 + read2 + 1000M
+L read1 + read0 - 2000M
+L read2 - read1 + 3000M
+L read2 - read0 - 1000M
diff --git a/test/testdata/spec_q6.gfa b/test/testdata/spec_q6.gfa
new file mode 100644
index 0000000..5c1c867
--- /dev/null
+++ b/test/testdata/spec_q6.gfa
@@ -0,0 +1,9 @@
+S 0 GAT TN:Z:Human
+S 1 CCC TN:Z:Mouse
+S 2 TAC TN:Z:Human,Mouse
+S 3 A TN:Z:Human
+S 4 TTA TN:Z:Mouse
+L 0 + 2 + 0M TN:Z:Human
+L 1 + 2 + 0M TN:Z:Mouse
+L 2 + 3 + 0M TN:Z:Human
+L 2 + 4 + 0M TN:Z:Mouse
diff --git a/test/testdata/spec_q7.gfa b/test/testdata/spec_q7.gfa
new file mode 100644
index 0000000..26d1955
--- /dev/null
+++ b/test/testdata/spec_q7.gfa
@@ -0,0 +1,9 @@
+H VN:Z:1.0
+H ul:Z:https://github.com/pmelsted/GFA-spec/issues/7#issuecomment-219685552
+S 11 ACCTT PG:J:{"Human":[{"target":"chr1","pos":1500,"strand":true}],"Mouse":[{"target":"chr2","pos":2000,"strand":false}],"ecoli":[{"target":"chr1","pos":2000,"strand":false},{"target":"chr1","pos":3000,"strand"=true}]}
+S 12 TCAAGG
+S 13 CTTGATT
+L 11 + 12 - 4M
+L 12 - 13 + 5M
+L 11 + 13 + 3M
+P 14 11+,12-,13+ 4M,5M
diff --git a/test/testdata/two_components.gfa b/test/testdata/two_components.gfa
new file mode 100644
index 0000000..4d9df58
--- /dev/null
+++ b/test/testdata/two_components.gfa
@@ -0,0 +1,11 @@
+H VN:Z:1.0
+S 1 * LN:i:1000
+S 2 * LN:i:1000
+S 3 * LN:i:1000
+S 4 * LN:i:1000
+S 5 * LN:i:1000
+S 6 * LN:i:1000
+L 1 + 2 + *
+L 1 + 3 + *
+L 1 + 4 + *
+L 5 + 6 + *
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/ruby-rgfa.git
More information about the debian-med-commit
mailing list