[med-svn] r105 - in trunk/packages/sigma-align/trunk: . debian

Charles Plessy charles-guest at costa.debian.org
Fri Sep 8 03:56:07 UTC 2006


Author: charles-guest
Date: 2006-09-08 03:56:06 +0000 (Fri, 08 Sep 2006)
New Revision: 105

Added:
   trunk/packages/sigma-align/trunk/sigma.ml
Modified:
   trunk/packages/sigma-align/trunk/debian/control
   trunk/packages/sigma-align/trunk/debian/rules
Log:
conforming to the ocaml policy

Modified: trunk/packages/sigma-align/trunk/debian/control
===================================================================
--- trunk/packages/sigma-align/trunk/debian/control	2006-09-05 15:40:25 UTC (rev 104)
+++ trunk/packages/sigma-align/trunk/debian/control	2006-09-08 03:56:06 UTC (rev 105)
@@ -3,12 +3,12 @@
 Priority: optional
 Maintainer: Debian-Med Packaging Team <debian-med-packaging at lists.alioth.debian.org>
 Uploaders: Charles Plessy <charles-debian-nospam at plessy.org>
-Build-Depends: debhelper (>= 5), ocaml-nox, xsltproc, docbook-xsl, docbook-xml
+Build-Depends: debhelper (>= 5), ocaml-best-compilers, xsltproc, docbook-xsl, docbook-xml
 Standards-Version: 3.7.2
 
 Package: sigma-align
 Architecture: any
-Depends: ${shlibs:Depends}, ${misc:Depends}
+Depends: ocaml-base-nox-${F:OCamlABI}, ${shlibs:Depends}, ${misc:Depends}
 Description: Simple greedy multiple alignment of non-coding DNA sequences
  Sigma ("Simple greedy multiple alignment") is an alignment program with a new
  algorithm and scoring scheme designed specifically for non-coding DNA

Modified: trunk/packages/sigma-align/trunk/debian/rules
===================================================================
--- trunk/packages/sigma-align/trunk/debian/rules	2006-09-05 15:40:25 UTC (rev 104)
+++ trunk/packages/sigma-align/trunk/debian/rules	2006-09-08 03:56:06 UTC (rev 105)
@@ -13,7 +13,11 @@
 
 build: 
 	dh_testdir
-	ocamlopt str.cmxa sigma.ml -o sigma
+	if [ -x /usr/bin/ocamlopt ]; then \
+		ocamlopt str.cmxa sigma.ml -o sigma ; \
+	else \
+		ocamlc str.cma sigma.ml -o sigma ; \
+	fi
 	xsltproc -o debian/ -''-nonet /usr/share/xml/docbook/stylesheet/nwalsh/manpages/docbook.xsl debian/sigma.1.xml
 	touch build
 
@@ -34,18 +38,16 @@
 binary-arch: build install
 	dh_testdir
 	dh_testroot
-	dh_installchangelogs 
-	dh_installdocs
-	dh_installexamples
+	dh_installdocs sigma.ml
 	dh_install sigma usr/bin
 	dh_installman debian/sigma.1
 	dh_link
-	dh_strip
+#	dh_strip
 	dh_compress
 	dh_fixperms
 	dh_installdeb
 	dh_shlibdeps
-	dh_gencontrol
+	dh_gencontrol -s -- -VF:OCamlABI="$(OCAMLABI)"
 	dh_md5sums
 	dh_builddeb
 

Added: trunk/packages/sigma-align/trunk/sigma.ml
===================================================================
--- trunk/packages/sigma-align/trunk/sigma.ml	                        (rev 0)
+++ trunk/packages/sigma-align/trunk/sigma.ml	2006-09-08 03:56:06 UTC (rev 105)
@@ -0,0 +1,1071 @@
+(***  SiGMA : Simple Greedy Multiple Alignment
+ ***  Version 1.0 (March 11, 2006)
+ ***  Copyright (C) Rahul Siddharthan, 2006
+ ***
+ ***  Requires ocaml 3.x (http://caml.inria.fr)
+ ***  When compiling, link the str module as follows:
+ ***  bytecode compiler    -> ocamlc str.cma sigma.ml -o sigma.bytecode
+ ***  native-code compiler -> ocamlopt str.cmxa sigma.ml -o sigma
+ ***  
+ ***  (native is around 7x-10x faster, but no debugger support)
+ ***  
+ ***  To produce a static binary on linux (so that it's independent of
+ ***  your machine's particular version of libc), use
+ ***  ocamlopt -cc "gcc -static" str.cmxa sigma.ml -o sigma
+ ***) 
+
+(***  Strategy: find most significant common substring between two strings 
+ ***  (mutations may exist but are taken care of in the scoring) 
+ ***  and "align" them.  Then next largest common substring among 
+ ***  existing fragments that may be "consistently" aligned.   Each time
+ ***  a common substring is found it is "fused" into one fragment (with
+ ***  multiple associated sequence numbers).  Repeat until nothing left.
+ ***  
+ ***  Actually, for reasons of efficiency a list of "best matches" is made
+ ***  and aligned with consistency checks, rather than doing it one
+ ***  match at a time, but that's a detail.
+ ***)
+
+module IntSet = Set.Make (struct type t=int
+                                 let compare=compare end )
+
+(*** the main data type, the sequence fragment ***)
+type neigh_var = None | Frag of seq_frag
+and seq_frag = { header : string ; 
+                 seq : (int, string) Hashtbl.t ;
+                 mutable aseq: string;
+		 mutable abgw: float array;
+                 mutable seqindex : IntSet.t;
+                 seqlabel: (int, string) Hashtbl.t;
+                 neighbours: (int * char, neigh_var) Hashtbl.t;
+                 limits : (int * char, string) Hashtbl.t } 
+
+(***  Notes:
+ ***  
+ ***  1. seq is a hashtbl keyed by sequence index n, giving corresponding
+ ***  sequence, since after alignment there may be mismatches.
+ ***  
+ ***  2. aseq is a string giving the consensus of all aligned strings.
+ ***  For now, a mismatch will be marked as N and subsequent bases
+ ***  aligned to it will get a score zero.
+ ***  
+ ***  3.  seqindex is the original sequence to which the current sequence
+ ***  fragment belongs; a set containing initially a single number, but,
+ ***  as pairs are made, possibly multiple numbers
+ ***  
+ ***  4.  limits is a hashtbl keyed with pairs of sequence numbers and chars
+ ***  (n,c='l'/'r') mapping  to the legal start (l) or end (r) position on 
+ ***  sequence n for this fragment.
+ ***  
+ ***  5.  seqlabel is an array whose nth element is the label of that
+ ***  fragment on sequence n, a string representation of a float between 0
+ ***  and 1 (actually between 0 and 0.3).  Each fragment on each sequence
+ ***  will have a unique label.
+ ***  
+ ***  6.  neighbours is a hashtbl that takes as keys a pair of the
+ ***  sequence number and a char 'l'/'r' and outputs a ref to the
+ ***  neighbouring frag on that seq in that direction
+ ***  
+ ***  7. abgw is the background weight, the log of the (conditional)
+ ***  background probability for that base divided by 0.25.  
+ ***  
+ ***  The sequence starts in two datastructures, a list (whose first
+ ***  element is a dummy) and a set that contains all the "real" elements.
+ ***  At each alignment step two sequences (elements of the set) are
+ ***  broken up into 5 sequences (a shared portion and the non-shared
+ ***  boundaries, which may be zero-length).  The original list always
+ ***  contains the starting element, with pointer to the next fragment.
+ ***  It's like a doubly-linked list (actually, a set of lists with 
+ ***  crosslinks, if you like) with end fragments pointing to a 
+ ***  "dummy" fragment.
+ ***)
+
+type fasta_seq = { fheader : string; mutable fseq: Buffer.t}
+
+(*** Global variables ***)
+let prand = ref 0.002
+let correlbg = ref 2
+let bgfilename = ref ""
+let auxfilename = ref ""
+let singlesite = ref (Array.make 4 0.0)
+let dinucleotide = ref (Array.make_matrix 4 4 0.0)
+
+(***                                                              ***)
+(***                                                              ***)
+(***                                                              ***)
+
+(***  best_match: finds the most significant common substring with
+ ***  mismatches.  The significance is basically the background 
+ ***  probability of the string multiplied by the probability of
+ ***  seeing a match of that quality in two long strings of the given
+ ***  lengths.  The probability for fragment size l with string lengths
+ ***  l1 and l2 is 1-(1-p)^[(l1-l+1)(l2-l+1)] = approx. p*(l1-l+1)*(l2-l+1)
+ ***  where p = bg prob * (l choose m), where m = number of mismatches.
+ ***  Lowest answer = most significant.
+ ***)
+
+let best_match s1 s2 w1 w2 = 
+  let sl1 = String.length s1 in 
+  let sl2 = String.length s2 in
+  let fmat = Array.make_matrix (sl1 + 1) (sl2 + 1) 1.0 in
+  let lmat = Array.make_matrix (sl1 + 1) (sl2 + 1) 0 in
+  let mmat = Array.make_matrix (sl1 + 1) (sl2 + 1) 0 in
+  let best = ref 1.0 in
+  let bestm = ref 0 in
+  let bestn = ref 0 in
+  let lbest = ref 0 in
+    for m = 1 to sl1  do
+      for n = 1 to sl2  do 
+        let a1 = s1.[m-1] in
+        let b1 = s2.[n-1] in
+        let f_old = fmat.(m-1).(n-1) in
+        let m_old = mmat.(m-1).(n-1) in
+        let l_new = lmat.(m-1).(n-1) + 1 in
+        let m_new, f_new =
+          if (a1=b1) && (a1 != 'N') 
+          then 
+            m_old, f_old *. sqrt(w1.(m-1)*.w2.(n-1))
+              *. (float l_new) /. (float (l_new-m_old))
+          else  m_old + 1, f_old *. (float l_new) /. (float (m_old + 1))
+        in 
+        let (f_new, l_new, m_new) = (min (f_new,l_new,m_new) (1.0,0,0)) in
+          ( fmat.(m).(n) <- f_new;
+            lmat.(m).(n) <- l_new; 
+            mmat.(m).(n) <- m_new;
+	    if (f_new < !best) then
+	      ( best := f_new; bestm := m; bestn := n)
+          ) 
+      done
+    done;
+    while (fmat.(!bestm - !lbest).(!bestn - !lbest) < 1.0) 
+      && (!lbest < !bestm) && (!lbest < !bestn) do
+        lbest := !lbest + 1
+    done;
+    best := !best *. (float (sl1 - !lbest+1)) *. (float (sl2 - !lbest+1));
+    if (!best < !prand)
+    then  !bestm - !lbest, !bestn - !lbest, !lbest, !best
+    else 0,0,0,1.0
+;;
+
+(***                                                              ***)
+(***                                                              ***)
+(***                                                              ***)
+
+let compareSeq seq1 seq2 =
+  (***  if seqindex is same in both cases then check fragment number
+   ***  from "seqlabel", otherwise IntSet.compare *)
+  (***  this is needed to create the module SeqSet, but isn't used,
+   ***  except to the extent that this should not return 0 unless
+   ***  two frags really are identical, which should not happen *)
+    match (IntSet.compare seq1.seqindex seq2.seqindex ) with
+        0 -> ( let rec complabel = function
+                 |  [] -> 0
+                 | a::t -> 
+                     match (compare (Hashtbl.find seq1.seqlabel a)
+                                    (Hashtbl.find seq2.seqlabel a)) with
+                         0 -> complabel t
+                       | n -> n 
+               in complabel (IntSet.elements seq1.seqindex))
+      | n -> n ;;
+
+
+module SeqSet = Set.Make (struct type t=seq_frag
+                                 let compare=compareSeq end );;
+
+let seqset = ref SeqSet.empty ;;
+
+(***                                                              ***)
+(***                                                              ***)
+(***                                                              ***)
+
+let readlines_noblank ffilename = (* works like python's readlines *)
+  let fchan = open_in ffilename in
+  let lenstr = in_channel_length fchan in
+  let res = String.create lenstr in 
+    really_input fchan res 0 lenstr ;
+    Str.split (Str.regexp "\\(\r\n\\|\n\\)+" ) res ;;
+
+let newfseq h =   { header = h;
+                    seq = Hashtbl.create 5;
+		    aseq = "" ;
+		    abgw = Array.create 0 0.0;
+                    seqindex = IntSet.empty; 
+                    seqlabel = Hashtbl.create 5;
+                    neighbours = Hashtbl.create 10;
+                    limits = Hashtbl.create 5 };;
+
+let readfasta filename = (*   reads a fasta sequence, returns it as a list 
+                          ***  with a dummy first element used as a marker *)
+  let flines = readlines_noblank filename in
+  let rec flines2flist flist flines fcurr ln =
+    if List.length flines = 0 then
+      fcurr::flist
+    else
+      match (List.hd flines).[0] with
+          '>' -> 
+            let fcurr2 = newfseq (List.hd flines)
+            in flines2flist (fcurr::flist) (List.tl flines) fcurr2 (ln+1)
+        | ';'|'#' -> flines2flist flist (List.tl flines) fcurr ln
+        | _ -> 
+            let fcurr2 = { header = fcurr.header; seq=fcurr.seq;
+                           aseq = fcurr.aseq ^ (String.lowercase 
+                                                  (List.hd flines)) ;
+			   abgw = Array.create 0 0.0;
+                           seqlabel= fcurr.seqlabel ;
+                           neighbours = fcurr.neighbours;
+                           seqindex = fcurr.seqindex;
+                           limits = fcurr.limits
+                         } 
+            in flines2flist flist (List.tl flines) fcurr2 ln
+  in 
+    List.rev (flines2flist [] flines (newfseq "" ) (-1));;
+
+
+let makefastaset seqlist =
+  let rec list2set a s =
+    match a with 
+        [] -> s
+      | h::t -> list2set t (SeqSet.add h s)
+  in list2set (List.tl seqlist) (SeqSet.empty);;
+
+
+let basenum  = function
+  | 'a' | 'A' -> 0
+  | 'c' | 'C' -> 1
+  | 'g' | 'G' -> 2
+  | 't' | 'T' -> 3
+  | _ -> -1
+;;
+
+
+let getbgprobs_from_bgfile bgfile = (* read singlesite and dinucleotide
+				       from bgfile *)
+  let bglines = readlines_noblank bgfile in
+  let line2val line1 =
+    let line1s=Str.split (Str.regexp "[ \t]+") line1 in
+      if line1s > [] && line1.[0]!= '#' && (String.length (List.hd line1s) < 3)
+      then 
+	let bs = List.hd line1s in
+	  if (String.length bs = 1) then
+	    !singlesite.(basenum bs.[0]) <-
+	      (float_of_string (List.nth line1s 1))
+	else 
+	  !dinucleotide.(basenum bs.[0]).(basenum bs.[1]) <-
+	    (float_of_string (List.nth line1s 1))
+      else ()
+  in 
+    List.iter line2val bglines;;
+
+
+let getbgprobs auxseqlist =
+  (*** assume !singlesite, !dinucleotide are initialised to zero ***)
+  List.iter 
+    (fun f -> 
+       for n = 0 to (String.length f.aseq) - 1 do
+         let c = basenum f.aseq.[n] in
+           if c >= 0 then 
+             (!singlesite.(c) <- !singlesite.(c) +. 1.0;
+              !singlesite.(3-c) <- !singlesite.(3-c) +. 1.0;
+              if n>0 then
+                let c1 = basenum f.aseq.[n-1] in
+                  if c1 >=0 then
+                    (!dinucleotide.(c1).(c) <- !dinucleotide.(c1).(c) +. 1.0;
+                    )
+             )
+       done 
+    ) auxseqlist 
+;;
+
+
+let normalise_bgprobs () =
+  (let tots = ref 0.0 in
+     (for n = 0 to 3 do
+        tots := !tots +. !singlesite.(n)
+      done; 
+      for n = 0 to 3 do
+        !singlesite.(n) <- !singlesite.(n) /. !tots
+      done
+     )
+  );
+  for n = 0 to 3 do 
+    let totd = ref 0.0 in
+      (for m = 0 to 3 do
+         totd := !totd +. !dinucleotide.(n).(m)
+       done;
+       for m = 0 to 3 do
+         !dinucleotide.(n).(m) <- !dinucleotide.(n).(m) /. !totd
+       done
+      )
+  done 
+;;
+
+let getwts s =
+  let w = Array.make (String.length s) 0.0 in
+    ( for n = 0 to (String.length s) - 1 do
+        if !correlbg = 0 then w.(n) <- 0.25
+        else 
+          let cn = basenum s.[n] in
+            if cn<0 then w.(n) <- 1.
+            else if n=0 then w.(n) <- !singlesite.(cn)
+            else
+              let cn1 = basenum s.[n-1] in
+                if cn1 < 0 || !correlbg = 1 
+	        then w.(n) <- !singlesite.(cn)
+                else w.(n) <- !dinucleotide.(cn1).(cn)
+      done;
+      w
+    )
+;;
+
+
+let filename2seqlist filenameset gotbgprobs =
+  let fastalist1 =
+    try 
+      readfasta (List.hd filenameset)
+    with 
+	_ -> ( Printf.fprintf stderr 
+                 "Error: unable to read file %s\n" (List.hd filenameset);
+               exit 1)
+  in     
+  let fastalist2l =
+    List.map 
+      (fun filename ->
+         try 
+           readfasta filename
+         with 
+             _ -> ( Printf.fprintf stderr 
+                      "Error: unable to read file %s\n" filename;
+                    exit 1)
+      ) (List.tl filenameset)
+  in
+  let makeseqlist = function
+    | [] -> []
+    | h::t ->
+        List.fold_left (fun n m -> n @ (List.tl m)) (List.tl h) t
+  in
+  let fastalist = fastalist1 @ (makeseqlist fastalist2l) in
+    ( if gotbgprobs then () else
+	( getbgprobs fastalist;
+          normalise_bgprobs () 
+	);
+      List.iter (fun f -> f.abgw <- getwts f.aseq) fastalist;
+      fastalist
+    )
+;;
+
+
+
+let rec blockseq seql seqnumset maxl maxw =
+  (***  the main routine, recursively forms blocks until no more are
+	***  possible *)
+  (***  first, make a list of best pairwise matches, removing matches
+	***  that conflict with better matches; then pairwise-align those, 
+	***  one by one; then call self again.  If null list, quit. *)
+
+  let seqhead = List.hd seql in
+  let seqlist = List.tl seql in
+
+  let find_neigh = function
+    | None -> seqhead 
+    | Frag a -> a
+  in 
+
+  let update_limits seqfrag = (* updates limits for consistent alignment *)
+    let rec sweep_limits frag l_stop r_start m =
+      let lim = seqfrag.limits in
+        if frag==seqhead then () 
+        else
+          let nextfrag = find_neigh (Hashtbl.find frag.neighbours (m,'r')) in
+          let thislabel = Hashtbl.find frag.seqlabel m in
+          let fix_limits n =
+            ( ( if thislabel <= l_stop then
+                  let newlim = 
+		    try Hashtbl.find lim (n,'r') 
+		    with _ -> "0.3"
+		  in
+		  let oldlim = 
+		    try Hashtbl.find frag.limits (n,'r')
+		    with _ -> "0.3"
+		  in 
+                    if oldlim > newlim
+                    then Hashtbl.replace frag.limits (n,'r') newlim);
+              ( if thislabel >= r_start then
+                  let newlim = 
+		    try Hashtbl.find lim (n,'l') 
+		    with _ -> "0"
+		  in
+		  let oldlim =
+		    try Hashtbl.find frag.limits (n,'l')
+		    with _ -> "0"
+		  in 
+                    if oldlim < newlim
+                    then Hashtbl.replace frag.limits (n,'l') newlim))
+          in 
+            (IntSet.iter fix_limits seqnumset;
+             sweep_limits nextfrag l_stop r_start m)
+
+    in (
+        IntSet.iter  (* Set "diagonal" limits to self *)
+          (fun n -> 
+             (Hashtbl.replace seqfrag.limits (n,'l') 
+		(Hashtbl.find seqfrag.seqlabel n) ;
+              Hashtbl.replace seqfrag.limits (n,'r') 
+		(Hashtbl.find seqfrag.seqlabel  n)))
+          seqfrag.seqindex;
+        IntSet.iter (*   replace each limit with most stringent limit of
+                     ***  a neighbouring fragment *)
+          (fun n -> 
+             ( IntSet.iter 
+                 (fun m ->
+                    (let fragl = find_neigh 
+		       (Hashtbl.find seqfrag.neighbours (m,'l')) in
+                     let limn = 
+		       try Hashtbl.find fragl.limits (n,'l') 
+		       with _ -> "0"
+		     in
+		     let seqfraglim = 
+		       try Hashtbl.find seqfrag.limits (n,'l')
+		       with _ -> "0"
+		     in 
+                       if limn > seqfraglim
+                       then Hashtbl.replace seqfrag.limits (n,'l') limn );
+                    (let fragr = find_neigh 
+		       (Hashtbl.find seqfrag.neighbours (m,'r')) in
+                     let limn = 
+		       try Hashtbl.find fragr.limits (n,'r') 
+		       with _ -> "0.3" 
+		     in
+		     let seqfraglim = 
+		       try Hashtbl.find seqfrag.limits (n,'r') 
+		       with _ -> "0.3"
+		     in 
+                       if limn < seqfraglim
+                       then Hashtbl.replace seqfrag.limits (n,'r') limn )
+                 ) seqfrag.seqindex )
+          ) seqnumset;
+        let do_sweep n =
+          let initfrag = (List.nth seqlist n) in
+	  let llimit =
+	    try Hashtbl.find seqfrag.limits (n,'l')
+	    with _ -> "0"
+	  in 
+	  let rlimit = 
+	    try Hashtbl.find seqfrag.limits (n,'r')
+	    with _ -> "0.3"
+	  in 
+            sweep_limits initfrag llimit rlimit n
+        in 
+          IntSet.iter do_sweep seqnumset
+      )
+  in 
+
+  let inconsistent_frags frag1 frag2  =
+    let rec inconsistent_indices indexlist1 indexlist2 =
+      (***  check that the frags don't lie outside allowed ranges for
+	    ***  one another *)
+      let rec inconsistent_indices_onepair index1 indexlist2 =
+        match indexlist2 with
+            [] -> false
+          | h::t -> 
+	      let lim2l =
+		try Hashtbl.find frag2.limits (index1,'l') 
+		with _ -> "0"
+	      in 
+	      let lim2r =
+		try Hashtbl.find frag2.limits (index1,'r') 
+		with _ -> "0.3"
+	      in 
+	      let lim1l = 
+		try Hashtbl.find frag1.limits (h,'l')
+		with _ -> "0"
+	      in 
+	      let lim1r =
+		try Hashtbl.find frag1.limits (h,'r')
+		with _ -> "0.3"
+	      in 
+                if (compare (Hashtbl.find frag1.seqlabel index1) lim2l <= 0) 
+                  || (compare (Hashtbl.find frag1.seqlabel index1) lim2r >= 0)
+                  || (compare (Hashtbl.find frag2.seqlabel h ) lim1l <= 0)
+                  || (compare (Hashtbl.find frag2.seqlabel h) lim1r >= 0)
+                then true
+                else inconsistent_indices_onepair index1 t
+      in 
+        match indexlist1 with
+            [] -> false
+          | h::t -> 
+              if inconsistent_indices_onepair h indexlist2
+              then true
+              else inconsistent_indices t indexlist2
+    in inconsistent_indices (IntSet.elements frag1.seqindex) 
+         (IntSet.elements frag2.seqindex) 
+  in 
+
+
+  let rec make_block_list fraglist outlist =
+    (***  make a list of best matches, to be traversed by fuse_block_list *)
+    (***  note: at least one of the frags being paired must have 
+	  ***  "paired"=true *)
+    let rec one_block frag rest_of_list outlist = 
+      match rest_of_list with
+          [] -> outlist
+        | a::t -> 
+	    if 
+	      (frag.aseq != "") && (a.aseq != "")	      
+            then 
+	      let n1,n2,l,p = 
+		best_match frag.aseq a.aseq frag.abgw a.abgw
+	      in
+                if p<1.0 then
+		  let s1 = IntSet.elements frag.seqindex in
+		  let s2 = IntSet.elements a.seqindex in
+		    one_block frag t ((p,frag,n1,s1,a,n2,s2,l)::outlist)
+                else 
+                  one_block frag t outlist
+	    else 
+	      one_block frag t outlist
+    in 
+      match fraglist with
+          [] -> outlist
+        | a::t -> 
+	    let t1 = 
+	      List.filter (
+		fun f -> not (inconsistent_frags a f)) t in
+	      make_block_list t (one_block a t1 outlist)
+  in 
+
+
+  let fuse_block_list blocklist =
+
+    let rec firstfrag (p,frag1,n1,s1,frag2,n2,s2,l) =
+      if n1 >= (String.length frag1.aseq) 
+      then
+        let frag1b= find_neigh (Hashtbl.find frag1.neighbours (List.hd s1,'r'))
+        in
+          firstfrag (p,frag1b,n1-(String.length frag1.aseq), s1,
+                     frag2,n2,s2,l)
+      else if n2 >= (String.length frag2.aseq)
+      then
+        let frag2b= find_neigh (Hashtbl.find frag2.neighbours (List.hd s2,'r'))
+        in
+          firstfrag (p,frag1,n1,s1,
+                     frag2b,n2-(String.length frag2.aseq), s2,l)
+      else (p,frag1,n1,s1,frag2,n2,s2,l)
+    in 
+
+    let nextfrag (p,frag1,n1,s1,frag2,n2,s2,l) =
+      let f1gap = ((String.length frag1.aseq) - n1) in
+      let f2gap = ((String.length frag2.aseq) - n2) in
+	if (f1gap < f2gap) then
+	  let frag1b = 
+	    find_neigh (Hashtbl.find frag1.neighbours (List.hd s1, 'r')) in
+	  let lb = l - f1gap in
+	    (p,frag1,n1,s1,frag2,n2,s2,f1gap), 
+	    (p,frag1b,0,s1,frag2,n2+f1gap,s2,lb)
+	else if (f1gap > f2gap) then 
+	  let frag2b =
+	    find_neigh (Hashtbl.find frag2.neighbours (List.hd s2, 'r')) in
+	  let lb = l - f2gap in
+	    (p,frag1,n1,s1,frag2,n2,s2,f2gap), 
+	    (p,frag1,n1+f2gap,s1,frag2b,0,s2,lb)
+	else
+	  let frag1b = 
+	    find_neigh (Hashtbl.find frag1.neighbours (List.hd s1, 'r')) in
+	  let frag2b = 
+	    find_neigh (Hashtbl.find frag2.neighbours (List.hd s2, 'r')) in
+	  let lb = l - f1gap in
+            (p,frag1,n1,s1,frag2,n2,s2,f1gap), 
+            (p,frag1b,0,s1,frag2b,0,s2,lb)
+    in 
+
+    let rec expand_block pair1 outlist =
+      let (p,frag1,n1,s1,frag2,n2,s2,l) = pair1 in
+        (***  if the block, as annotated, is no longer in one fragment because
+              ***  of other fusings, then convert this block into a list of
+              ***  single-fragment blocks *)
+        if inconsistent_frags frag1 frag2
+        then []
+        else
+          if String.length (frag1.aseq) > n1+l-1 &&
+            String.length (frag2.aseq) > n2+l-1 
+          then (pair1::outlist)
+          else 
+            if frag1=seqhead || frag2=seqhead then
+              (print_endline "problem";
+               outlist)
+            else
+              let pair1,pair2 = nextfrag pair1 in
+                expand_block pair2 (pair1::outlist)
+    in 
+
+    let fuse_one_block (p,frag1,n1,s1,frag2,n2,s2,l) =
+      let consensus s1 s2 w1 w2 =
+        for i = 0 to (String.length s1) -1 do
+          if s1.[i] != s2.[i] then ( s1.[i] <- 'N'; w1.(i) <- 0.)
+        done;
+        s1, w1
+      in 
+	(***  fuse the two frags; it is assumed that expand_block has
+              ***  already been called so n1+l and n2+l won't exceed the length
+              ***  of the frag *)
+	(***  divide frag1 and frag2 into frag1, frag2, frag3, frag4, frag5
+              ***  where frag3 is the "shared piece" so the original frags are
+              ***  split into frag1->frag3->frag4 and frag2->frag3->frag5 *)
+
+	(
+          let tempstr1=String.copy frag1.aseq in
+          let tempstr2=String.copy frag2.aseq in 
+	  let tempbgw1=Array.copy frag1.abgw in
+	  let tempbgw2=Array.copy frag2.abgw in 
+	    (
+              frag1.aseq <- String.sub tempstr1 0 n1 ;
+              frag2.aseq <- String.sub tempstr2 0 n2 ;
+	      frag1.abgw <- Array.sub tempbgw1 0 n1;
+	      frag2.abgw <- Array.sub tempbgw2 0 n2;
+              let frag3 = 
+		let as3, aw3 = 
+		  consensus (String.sub tempstr1 n1 l) 
+		    (String.sub tempstr2 n2 l) (Array.sub tempbgw1 n1 l) 
+		    (Array.sub tempbgw2 n2 l)
+		in 
+		  { header = ""; 
+                    aseq = as3;
+		    abgw = aw3;
+		    seq = Hashtbl.copy frag1.seq;
+                    seqindex=IntSet.union frag1.seqindex frag2.seqindex;
+                    seqlabel= Hashtbl.copy frag1.seqlabel;
+                    neighbours=Hashtbl.copy frag1.neighbours;
+                    limits = Hashtbl.copy frag1.limits
+                  } in 
+              let frag4 = { header = ""; 
+                            aseq = String.sub tempstr1 (n1+l) ((String.length tempstr1)-n1-l);
+			    abgw = Array.sub tempbgw1 (n1+l) ((String.length tempstr1)-n1-l);
+			    seq = Hashtbl.copy frag1.seq;
+                            seqindex= IntSet.union frag1.seqindex IntSet.empty;
+                            seqlabel= Hashtbl.copy frag1.seqlabel;
+                            neighbours= Hashtbl.copy frag1.neighbours;
+                            limits = Hashtbl.copy frag1.limits
+                          } in 
+              let frag5 = { header = ""; 
+                            aseq = String.sub tempstr2 (n2+l) ((String.length tempstr2)-n2-l);
+                            abgw = Array.sub tempbgw2 (n2+l) ((String.length tempstr2)-n2-l);
+			    seq = Hashtbl.copy frag2.seq;
+                            seqindex=IntSet.union frag2.seqindex IntSet.empty;
+                            seqlabel= Hashtbl.copy frag2.seqlabel;
+                            neighbours= Hashtbl.copy frag2.neighbours;
+                            limits = Hashtbl.copy frag2.limits
+                          } 
+              in (
+                  IntSet.iter 
+                    (fun n -> (Hashtbl.replace frag3.neighbours (n,'l') 
+				 (Frag frag1);
+                               Hashtbl.replace frag3.neighbours (n,'r') 
+				 (Frag frag4);
+                               Hashtbl.replace frag1.neighbours (n,'r') 
+				 (Frag frag3);
+                               Hashtbl.replace frag4.neighbours (n,'l') 
+				 (Frag frag3);
+
+			       Hashtbl.replace frag3.seq n
+				 (String.sub (Hashtbl.find frag1.seq n) n1 l);
+			       Hashtbl.replace frag4.seq n
+				 (String.sub (Hashtbl.find frag1.seq n) 
+				    (n1+l) ((String.length tempstr1)-n1-l));
+			       Hashtbl.replace frag1.seq n
+				 (String.sub (Hashtbl.find frag1.seq n) 0 n1);
+
+                               Hashtbl.replace frag3.seqlabel n 
+				 ((Hashtbl.find frag1.seqlabel n) ^ "1");
+                               Hashtbl.replace frag4.seqlabel n  
+				 ((Hashtbl.find frag4.seqlabel n) ^ "2");
+                               Hashtbl.replace frag1.seqlabel n  
+				 ((Hashtbl.find frag1.seqlabel n) ^ "0");
+                              )) frag1.seqindex;
+                  IntSet.iter 
+                    (fun n -> (Hashtbl.replace frag3.neighbours (n,'l') 
+				 (Frag frag2);
+                               Hashtbl.replace frag3.neighbours (n,'r') 
+				 (Frag frag5);
+                               Hashtbl.replace frag2.neighbours (n,'r') 
+				 (Frag frag3);
+                               Hashtbl.replace frag5.neighbours (n,'l') 
+				 (Frag frag3);
+
+			       Hashtbl.replace frag3.seq n
+				 (String.sub (Hashtbl.find frag2.seq n) n2 l);
+			       (try
+				  Hashtbl.replace frag5.seq n
+				    (String.sub (Hashtbl.find frag2.seq n) 
+				       (n2+l) ((String.length tempstr2)-n2-l))
+				with
+				    _ -> 
+				      print_endline (Hashtbl.find frag2.seq n)
+			       ); 
+			       Hashtbl.replace frag2.seq n
+				 (String.sub (Hashtbl.find frag2.seq n) 0 n2);
+
+                               Hashtbl.replace frag3.seqlabel n 
+				 ((Hashtbl.find frag2.seqlabel n) ^ "1");
+                               Hashtbl.replace frag5.seqlabel n 
+				 ((Hashtbl.find frag5.seqlabel n) ^ "2");
+                               Hashtbl.replace frag2.seqlabel n 
+				 ((Hashtbl.find frag2.seqlabel n) ^ "0")
+                              )) frag2.seqindex;
+
+                  seqset := SeqSet.add frag3 !seqset;
+                  seqset := SeqSet.add frag4 !seqset;
+                  seqset := SeqSet.add frag5 !seqset;
+                  update_limits frag3;
+		)
+            )
+	)
+    in 
+
+    let fuse_one_frag f = 
+      let expanded = expand_block (firstfrag f) [] in
+        if (List.for_all 
+              (fun (p,frag1,n1,s1,frag2,n2,s2,l) ->
+                 not (inconsistent_frags frag1 frag2))
+              expanded) 
+        then 
+          List.iter fuse_one_block expanded
+    in 
+      List.iter fuse_one_frag blocklist
+  in 
+  let fraglist = SeqSet.elements !seqset in
+  let blocklist =List.sort compare (make_block_list fraglist []) in
+    match blocklist with 
+        [] -> []
+      | _ -> (fuse_block_list blocklist;
+	      let (_,_,_,_,_,_,_,maxl) = List.hd blocklist in
+		blockseq seql seqnumset maxl maxw)
+;;
+
+(***                                                              ***)
+(***  Create aligned sequence out of the bunch of fragments       ***)
+(***  produced by blockseq                                        ***)
+(***                                                              ***)
+
+
+let rec assemble_frags_to_seqs seqhead seqlist foutlist  =
+
+  let is_complete frag = (*   if frag is multi-sequence, all copies
+                          *** are at head of seqlist *)
+    if frag == seqhead then false else
+    if IntSet.cardinal frag.seqindex = 1 then true else
+      let nfragmatches =
+        List.fold_left 
+          (fun n fr -> 
+	     if IntSet.compare frag.seqindex fr.seqindex = 0
+             then n+1 else n) 0 seqlist
+      in 
+        if nfragmatches = IntSet.cardinal frag.seqindex
+        then true else false
+  in 
+
+  let no_other_blocks frag = (*   No other complete frag has a smaller
+                              *** seqindex in seqlist *)
+    if IntSet.cardinal frag.seqindex = 1 then true else
+      List.fold_left 
+        (fun b fr -> 
+           if (IntSet.cardinal fr.seqindex > 0) &&
+             (IntSet.compare fr.seqindex frag.seqindex = -1) &&
+             is_complete fr
+           then false else b) true seqlist
+  in 
+
+  let maxlen fragwidth seqindex = (*    For a complete multi-sequence
+                                   ***  frag, it must be aligned when
+                                   ***  tacked on, and can't overlap an
+                                   ***  existing sequence; so find
+                                   ***  "padding" length *)
+    let maxlen_blockseqs =
+      IntSet.fold (fun idx n ->
+                     let fout = (List.nth foutlist idx) in
+                       if Buffer.length fout.fseq > n
+                       then Buffer.length fout.fseq else n) seqindex 0
+    in 
+    let push_up fs m =
+      let m1 = ref m in
+        (while 
+           (fs.[!m1] =Char.lowercase fs.[!m1]) do
+           m1 := !m1 + 1
+         done; 
+         while (!m1 < String.length fs)
+           &&(fs.[!m1] <> Char.lowercase fs.[!m1]) do
+           m1 := !m1 + 1
+         done; 
+         !m1)
+    in 
+    let rec overlap fs m1 n =
+      if n = fragwidth then false
+      else if (m1+n) >= String.length fs then false
+      else if fs.[m1+n] <> Char.lowercase fs.[m1+n] then true
+      else overlap fs m1 (n+1)
+    in 
+    let mx = ref maxlen_blockseqs in
+      (  for n = 0 to (List.length foutlist - 1) do
+         let fs = Buffer.contents (List.nth foutlist n).fseq in
+           (while overlap fs !mx 0 do
+              mx := push_up fs !mx
+            done ) 
+       done;
+       !mx )
+  in 
+
+  let find_neigh neigh1 =
+    match neigh1 with
+        None -> seqhead 
+      | Frag a -> a
+  in 
+
+  let rec process_frag_list seql foutl newseqlist n oldmaxlen =
+    match seql with
+        [] -> List.rev newseqlist
+      | frag::seqt ->
+	  let fragseq = Hashtbl.find frag.seq n in
+          let fout = List.hd foutl in
+            if (is_complete frag) && (no_other_blocks frag) then
+              ( if IntSet.cardinal frag.seqindex = 1 then
+                  (Buffer.add_string fout.fseq fragseq;
+                   let nextfrag = find_neigh 
+		     (Hashtbl.find frag.neighbours (n,'r')) in
+                     process_frag_list seqt (List.tl foutl) 
+                       (nextfrag::newseqlist) (n+1) oldmaxlen
+                  )
+                else (
+                  if oldmaxlen = 0 && 
+                    (Buffer.length fout.fseq > 0 || 
+                     n = List.hd (IntSet.elements frag.seqindex)) then (
+                    (*** Find padding value, unless we're just starting ***)
+                    (*** If just starting, make sure this is the first  ***)
+                    (*** in the block                                   ***)
+                    let newmaxlen = (maxlen (String.length fragseq) 
+                                       frag.seqindex) in
+                      while Buffer.length fout.fseq < newmaxlen do
+                        Buffer.add_char fout.fseq '-'
+                      done
+                  ) else
+                    while Buffer.length fout.fseq < oldmaxlen do
+                      Buffer.add_char fout.fseq '-'
+                    done;
+                  Buffer.add_string fout.fseq (String.uppercase fragseq);
+                  let nextfrag 
+		      = find_neigh (Hashtbl.find frag.neighbours (n,'r'))
+                  in 
+                    process_frag_list seqt (List.tl foutl) 
+                      (nextfrag::newseqlist) (n+1)
+                      ((Buffer.length fout.fseq) - (String.length fragseq))
+                )
+              )
+            else 
+              process_frag_list seqt (List.tl foutl)
+                (frag::newseqlist) (n+1) oldmaxlen
+  in 
+  let newseqlist = process_frag_list seqlist foutlist [] 0 0
+  in 
+  let end_list =
+    List.fold_left 
+      (fun b fr -> if fr==seqhead then b else false) true newseqlist
+  in
+    if end_list
+    then foutlist
+    else 
+      assemble_frags_to_seqs seqhead newseqlist foutlist
+;;
+
+(***                                                              ***)
+(***                                                              ***)
+(***                                                              ***)
+
+let print_aligned foutlist = 
+  let print_one_line foutlist n poslist =
+    let print_one_line_seq fout pos =
+      let newpos = ref pos in
+        for m = 0 to 49 do 
+          ( ( if m mod 50 = 0 then
+                let head = 
+                    if String.length fout.fheader < 11 
+                    then String.sub fout.fheader 1 
+                      (String.length fout.fheader -1)
+                    else String.sub fout.fheader 1 10 
+                in 
+                  ( print_string head;
+                    (if (String.length head < 10) then
+                       print_string 
+                         (String.make (10-(String.length head)) ' '));
+                    print_string "  "
+                  )
+              else if ((m) mod 10) = 0 then 
+                print_string " "
+            );
+            (let ch = 
+               if n+m < Buffer.length fout.fseq
+               then (Buffer.contents fout.fseq).[m+n] 
+               else ' ' in
+               (print_char ch;
+                if ch = '-' or ch = ' ' then
+                  ()
+                else 
+                  newpos := !newpos+1
+               )
+            );
+            if ((m+1) mod 50) = 0 then (
+              print_string "  ";
+              print_int !newpos;
+              print_string "\n";
+            )) 
+        done;
+        !newpos
+    in 
+      List.map2 print_one_line_seq foutlist poslist
+  in              
+    
+  let rec print_aligned foutlist n poslist =
+    (print_endline "";
+     if (List.fold_left (fun b f -> 
+                           if n < (Buffer.length f.fseq) then true else b) 
+           false foutlist) then 
+       print_aligned foutlist (n+50) (print_one_line foutlist n poslist)
+     else () 
+    )
+  in 
+  let poslist = List.map (fun f -> 0) foutlist
+  in print_aligned foutlist 0 poslist;;
+
+(***                                                              ***)
+(***                                                              ***)
+(***                                                              ***)
+
+let print_fasta foutlist =
+  List.iter 
+    (fun f ->
+       print_endline f.fheader;
+       print_endline (Buffer.contents f.fseq);
+       print_string "\n"
+    ) foutlist ;;
+
+(***                                                              ***)
+(***                                                              ***)
+(***                                                              ***)
+(***                                                              ***)
+
+(***  main program below *)
+
+let filenameset = ref [] ;;
+let outtype = ref "";;
+
+let argspeclist = 
+  [("-A", Arg.Unit 
+      (fun () -> outtype :=  !outtype^"A"), 
+      "aligned output (compare -F option) (default: only this)");
+   ("-b", Arg.Set_string auxfilename, "name of auxiliary file from which "
+      ^ "to read background sequences (overridden by -B)");
+   ("-B", Arg.Set_string bgfilename, "name of file containing background probabilities");
+   ("-F", Arg.Unit 
+      (fun () -> outtype := !outtype^"F"), 
+      "fasta output (can use both -A and -F in either order)");
+   ("-n", Arg.Set_int correlbg, "background correlation (default 2="
+      ^ "dinucleotide;\n     1=single-site basecounts, 0=0.25 per base)");
+   ("-x", Arg.Set_float prand, "set limit for how probable the match "
+      ^ "is by chance\n     (default 0.002, smaller=more stringent)")]
+in 
+let message = 
+  String.concat ""
+    ["Sigma, version 1.0: simple greedy multiple alignment\n";
+     "Copyright (C) 2006 Rahul Siddharthan <rsidd at imsc.res.in>\n";
+     "Licensed under the GNU General Public License, version 2\n\n";
+     "Usage: sigma [options] inputfile.fasta [inputfile2.fasta ...]\n\n";
+     "Each fasta file may contain a single sequence or multiple sequences;\n";
+     "all sequences will be aligned together.\n\n";
+     "Options:"]
+in                 
+  Arg.parse argspeclist (fun s -> filenameset := !filenameset @ [s]) message;;
+
+
+if !outtype = "" then outtype := "A";;
+
+
+if !filenameset = [] then
+  (Printf.fprintf stderr "Must specify name of file(s) in fasta format (or use -help)\n";
+   exit 1);;
+
+
+if !correlbg > 2 then 
+  (print_endline "-n > 2 not implemented; using 2";
+   correlbg := 2)
+else if !correlbg < 0 then
+  (print_endline "-n should be >= 0; assuming 0";
+   correlbg := 0)
+;;
+
+let gotbgprobs = ref (!correlbg = 0);;
+
+if !bgfilename > "" then
+  try 
+    ( getbgprobs_from_bgfile !bgfilename;
+      normalise_bgprobs ();
+      gotbgprobs := true 
+    )
+  with _ -> 
+    Printf.fprintf stderr "Couldn't read bgprobs from file %s\n" !bgfilename
+;;
+
+if (not !gotbgprobs) then (
+  let auxseqlist = 
+    match !auxfilename with
+	"" -> []
+      | _ -> 
+          try readfasta !auxfilename with
+              _ -> (Printf.fprintf stderr 
+		      "Couldn't open file %s : using input sequences\n" 
+		      !auxfilename;
+                    [])
+  in 
+    if auxseqlist != [] then
+      ( getbgprobs auxseqlist;
+        normalise_bgprobs () ;
+	gotbgprobs := true
+      )
+)
+;;
+
+let seqlist = filename2seqlist !filenameset !gotbgprobs ;;
+
+match List.length seqlist with
+    1 -> (Printf.fprintf stderr "Error: no sequences read\n"; exit 1)
+  | 2 -> (Printf.fprintf stderr 
+            "Only one sequence read, no alignment needed\n"; exit 0)
+  | _ -> ()
+;;
+
+let nel=(List.length seqlist) - 2 in
+  for n = 0 to nel do
+    let h = List.nth seqlist (n+1) in
+      ( h.seqindex <- IntSet.add n IntSet.empty ;
+        Hashtbl.replace h.neighbours (n,'l') (Frag (List.hd seqlist));
+        Hashtbl.replace h.neighbours (n,'r') (Frag (List.hd seqlist));
+        Hashtbl.replace h.seqlabel n "0.";
+        Hashtbl.replace (List.hd seqlist).seqlabel n "-1";
+        Hashtbl.replace h.seq n (String.copy h.aseq);
+	(* h.abgw <- getwts h.aseq; *)
+	(* h.abgw <- Array.create (String.length h.aseq) 0.0; *)
+        Hashtbl.replace (List.hd seqlist).seq n "";
+      )
+  done;;
+
+seqset := makefastaset seqlist;
+
+let seqnumset =
+  let rec range n r = 
+    if n<0 then r else range (n-1) (IntSet.add n r)
+  in range (List.length seqlist - 2) IntSet.empty
+in blockseq seqlist seqnumset 10000 0.;;
+
+let fout = List.map (fun fr -> {fheader=fr.header; 
+                                fseq = Buffer.create 1000} ) 
+             (List.tl seqlist) ;;
+
+assemble_frags_to_seqs (List.hd seqlist) (List.tl seqlist) fout ;;
+
+String.iter (fun c -> if c='A' then print_aligned fout else
+               print_fasta fout) !outtype;;




More information about the debian-med-commit mailing list