[med-svn] r251 - in trunk/packages/sigma-align/trunk: . debian

Charles Plessy charles-guest at alioth.debian.org
Sat Apr 7 06:46:41 UTC 2007


Author: charles-guest
Date: 2007-04-07 06:46:41 +0000 (Sat, 07 Apr 2007)
New Revision: 251

Removed:
   trunk/packages/sigma-align/trunk/debian/lintian-override/
   trunk/packages/sigma-align/trunk/sigma.ml
Log:
purging ocaml tings

Deleted: trunk/packages/sigma-align/trunk/sigma.ml
===================================================================
--- trunk/packages/sigma-align/trunk/sigma.ml	2007-04-07 06:44:15 UTC (rev 250)
+++ trunk/packages/sigma-align/trunk/sigma.ml	2007-04-07 06:46:41 UTC (rev 251)
@@ -1,1071 +0,0 @@
-(***  SiGMA : Simple Greedy Multiple Alignment
- ***  Version 1.0 (March 11, 2006)
- ***  Copyright (C) Rahul Siddharthan, 2006
- ***
- ***  Requires ocaml 3.x (http://caml.inria.fr)
- ***  When compiling, link the str module as follows:
- ***  bytecode compiler    -> ocamlc str.cma sigma.ml -o sigma.bytecode
- ***  native-code compiler -> ocamlopt str.cmxa sigma.ml -o sigma
- ***  
- ***  (native is around 7x-10x faster, but no debugger support)
- ***  
- ***  To produce a static binary on linux (so that it's independent of
- ***  your machine's particular version of libc), use
- ***  ocamlopt -cc "gcc -static" str.cmxa sigma.ml -o sigma
- ***) 
-
-(***  Strategy: find most significant common substring between two strings 
- ***  (mutations may exist but are taken care of in the scoring) 
- ***  and "align" them.  Then next largest common substring among 
- ***  existing fragments that may be "consistently" aligned.   Each time
- ***  a common substring is found it is "fused" into one fragment (with
- ***  multiple associated sequence numbers).  Repeat until nothing left.
- ***  
- ***  Actually, for reasons of efficiency a list of "best matches" is made
- ***  and aligned with consistency checks, rather than doing it one
- ***  match at a time, but that's a detail.
- ***)
-
-module IntSet = Set.Make (struct type t=int
-                                 let compare=compare end )
-
-(*** the main data type, the sequence fragment ***)
-type neigh_var = None | Frag of seq_frag
-and seq_frag = { header : string ; 
-                 seq : (int, string) Hashtbl.t ;
-                 mutable aseq: string;
-		 mutable abgw: float array;
-                 mutable seqindex : IntSet.t;
-                 seqlabel: (int, string) Hashtbl.t;
-                 neighbours: (int * char, neigh_var) Hashtbl.t;
-                 limits : (int * char, string) Hashtbl.t } 
-
-(***  Notes:
- ***  
- ***  1. seq is a hashtbl keyed by sequence index n, giving corresponding
- ***  sequence, since after alignment there may be mismatches.
- ***  
- ***  2. aseq is a string giving the consensus of all aligned strings.
- ***  For now, a mismatch will be marked as N and subsequent bases
- ***  aligned to it will get a score zero.
- ***  
- ***  3.  seqindex is the original sequence to which the current sequence
- ***  fragment belongs; a set containing initially a single number, but,
- ***  as pairs are made, possibly multiple numbers
- ***  
- ***  4.  limits is a hashtbl keyed with pairs of sequence numbers and chars
- ***  (n,c='l'/'r') mapping  to the legal start (l) or end (r) position on 
- ***  sequence n for this fragment.
- ***  
- ***  5.  seqlabel is an array whose nth element is the label of that
- ***  fragment on sequence n, a string representation of a float between 0
- ***  and 1 (actually between 0 and 0.3).  Each fragment on each sequence
- ***  will have a unique label.
- ***  
- ***  6.  neighbours is a hashtbl that takes as keys a pair of the
- ***  sequence number and a char 'l'/'r' and outputs a ref to the
- ***  neighbouring frag on that seq in that direction
- ***  
- ***  7. abgw is the background weight, the log of the (conditional)
- ***  background probability for that base divided by 0.25.  
- ***  
- ***  The sequence starts in two datastructures, a list (whose first
- ***  element is a dummy) and a set that contains all the "real" elements.
- ***  At each alignment step two sequences (elements of the set) are
- ***  broken up into 5 sequences (a shared portion and the non-shared
- ***  boundaries, which may be zero-length).  The original list always
- ***  contains the starting element, with pointer to the next fragment.
- ***  It's like a doubly-linked list (actually, a set of lists with 
- ***  crosslinks, if you like) with end fragments pointing to a 
- ***  "dummy" fragment.
- ***)
-
-type fasta_seq = { fheader : string; mutable fseq: Buffer.t}
-
-(*** Global variables ***)
-let prand = ref 0.002
-let correlbg = ref 2
-let bgfilename = ref ""
-let auxfilename = ref ""
-let singlesite = ref (Array.make 4 0.0)
-let dinucleotide = ref (Array.make_matrix 4 4 0.0)
-
-(***                                                              ***)
-(***                                                              ***)
-(***                                                              ***)
-
-(***  best_match: finds the most significant common substring with
- ***  mismatches.  The significance is basically the background 
- ***  probability of the string multiplied by the probability of
- ***  seeing a match of that quality in two long strings of the given
- ***  lengths.  The probability for fragment size l with string lengths
- ***  l1 and l2 is 1-(1-p)^[(l1-l+1)(l2-l+1)] = approx. p*(l1-l+1)*(l2-l+1)
- ***  where p = bg prob * (l choose m), where m = number of mismatches.
- ***  Lowest answer = most significant.
- ***)
-
-let best_match s1 s2 w1 w2 = 
-  let sl1 = String.length s1 in 
-  let sl2 = String.length s2 in
-  let fmat = Array.make_matrix (sl1 + 1) (sl2 + 1) 1.0 in
-  let lmat = Array.make_matrix (sl1 + 1) (sl2 + 1) 0 in
-  let mmat = Array.make_matrix (sl1 + 1) (sl2 + 1) 0 in
-  let best = ref 1.0 in
-  let bestm = ref 0 in
-  let bestn = ref 0 in
-  let lbest = ref 0 in
-    for m = 1 to sl1  do
-      for n = 1 to sl2  do 
-        let a1 = s1.[m-1] in
-        let b1 = s2.[n-1] in
-        let f_old = fmat.(m-1).(n-1) in
-        let m_old = mmat.(m-1).(n-1) in
-        let l_new = lmat.(m-1).(n-1) + 1 in
-        let m_new, f_new =
-          if (a1=b1) && (a1 != 'N') 
-          then 
-            m_old, f_old *. sqrt(w1.(m-1)*.w2.(n-1))
-              *. (float l_new) /. (float (l_new-m_old))
-          else  m_old + 1, f_old *. (float l_new) /. (float (m_old + 1))
-        in 
-        let (f_new, l_new, m_new) = (min (f_new,l_new,m_new) (1.0,0,0)) in
-          ( fmat.(m).(n) <- f_new;
-            lmat.(m).(n) <- l_new; 
-            mmat.(m).(n) <- m_new;
-	    if (f_new < !best) then
-	      ( best := f_new; bestm := m; bestn := n)
-          ) 
-      done
-    done;
-    while (fmat.(!bestm - !lbest).(!bestn - !lbest) < 1.0) 
-      && (!lbest < !bestm) && (!lbest < !bestn) do
-        lbest := !lbest + 1
-    done;
-    best := !best *. (float (sl1 - !lbest+1)) *. (float (sl2 - !lbest+1));
-    if (!best < !prand)
-    then  !bestm - !lbest, !bestn - !lbest, !lbest, !best
-    else 0,0,0,1.0
-;;
-
-(***                                                              ***)
-(***                                                              ***)
-(***                                                              ***)
-
-let compareSeq seq1 seq2 =
-  (***  if seqindex is same in both cases then check fragment number
-   ***  from "seqlabel", otherwise IntSet.compare *)
-  (***  this is needed to create the module SeqSet, but isn't used,
-   ***  except to the extent that this should not return 0 unless
-   ***  two frags really are identical, which should not happen *)
-    match (IntSet.compare seq1.seqindex seq2.seqindex ) with
-        0 -> ( let rec complabel = function
-                 |  [] -> 0
-                 | a::t -> 
-                     match (compare (Hashtbl.find seq1.seqlabel a)
-                                    (Hashtbl.find seq2.seqlabel a)) with
-                         0 -> complabel t
-                       | n -> n 
-               in complabel (IntSet.elements seq1.seqindex))
-      | n -> n ;;
-
-
-module SeqSet = Set.Make (struct type t=seq_frag
-                                 let compare=compareSeq end );;
-
-let seqset = ref SeqSet.empty ;;
-
-(***                                                              ***)
-(***                                                              ***)
-(***                                                              ***)
-
-let readlines_noblank ffilename = (* works like python's readlines *)
-  let fchan = open_in ffilename in
-  let lenstr = in_channel_length fchan in
-  let res = String.create lenstr in 
-    really_input fchan res 0 lenstr ;
-    Str.split (Str.regexp "\\(\r\n\\|\n\\)+" ) res ;;
-
-let newfseq h =   { header = h;
-                    seq = Hashtbl.create 5;
-		    aseq = "" ;
-		    abgw = Array.create 0 0.0;
-                    seqindex = IntSet.empty; 
-                    seqlabel = Hashtbl.create 5;
-                    neighbours = Hashtbl.create 10;
-                    limits = Hashtbl.create 5 };;
-
-let readfasta filename = (*   reads a fasta sequence, returns it as a list 
-                          ***  with a dummy first element used as a marker *)
-  let flines = readlines_noblank filename in
-  let rec flines2flist flist flines fcurr ln =
-    if List.length flines = 0 then
-      fcurr::flist
-    else
-      match (List.hd flines).[0] with
-          '>' -> 
-            let fcurr2 = newfseq (List.hd flines)
-            in flines2flist (fcurr::flist) (List.tl flines) fcurr2 (ln+1)
-        | ';'|'#' -> flines2flist flist (List.tl flines) fcurr ln
-        | _ -> 
-            let fcurr2 = { header = fcurr.header; seq=fcurr.seq;
-                           aseq = fcurr.aseq ^ (String.lowercase 
-                                                  (List.hd flines)) ;
-			   abgw = Array.create 0 0.0;
-                           seqlabel= fcurr.seqlabel ;
-                           neighbours = fcurr.neighbours;
-                           seqindex = fcurr.seqindex;
-                           limits = fcurr.limits
-                         } 
-            in flines2flist flist (List.tl flines) fcurr2 ln
-  in 
-    List.rev (flines2flist [] flines (newfseq "" ) (-1));;
-
-
-let makefastaset seqlist =
-  let rec list2set a s =
-    match a with 
-        [] -> s
-      | h::t -> list2set t (SeqSet.add h s)
-  in list2set (List.tl seqlist) (SeqSet.empty);;
-
-
-let basenum  = function
-  | 'a' | 'A' -> 0
-  | 'c' | 'C' -> 1
-  | 'g' | 'G' -> 2
-  | 't' | 'T' -> 3
-  | _ -> -1
-;;
-
-
-let getbgprobs_from_bgfile bgfile = (* read singlesite and dinucleotide
-				       from bgfile *)
-  let bglines = readlines_noblank bgfile in
-  let line2val line1 =
-    let line1s=Str.split (Str.regexp "[ \t]+") line1 in
-      if line1s > [] && line1.[0]!= '#' && (String.length (List.hd line1s) < 3)
-      then 
-	let bs = List.hd line1s in
-	  if (String.length bs = 1) then
-	    !singlesite.(basenum bs.[0]) <-
-	      (float_of_string (List.nth line1s 1))
-	else 
-	  !dinucleotide.(basenum bs.[0]).(basenum bs.[1]) <-
-	    (float_of_string (List.nth line1s 1))
-      else ()
-  in 
-    List.iter line2val bglines;;
-
-
-let getbgprobs auxseqlist =
-  (*** assume !singlesite, !dinucleotide are initialised to zero ***)
-  List.iter 
-    (fun f -> 
-       for n = 0 to (String.length f.aseq) - 1 do
-         let c = basenum f.aseq.[n] in
-           if c >= 0 then 
-             (!singlesite.(c) <- !singlesite.(c) +. 1.0;
-              !singlesite.(3-c) <- !singlesite.(3-c) +. 1.0;
-              if n>0 then
-                let c1 = basenum f.aseq.[n-1] in
-                  if c1 >=0 then
-                    (!dinucleotide.(c1).(c) <- !dinucleotide.(c1).(c) +. 1.0;
-                    )
-             )
-       done 
-    ) auxseqlist 
-;;
-
-
-let normalise_bgprobs () =
-  (let tots = ref 0.0 in
-     (for n = 0 to 3 do
-        tots := !tots +. !singlesite.(n)
-      done; 
-      for n = 0 to 3 do
-        !singlesite.(n) <- !singlesite.(n) /. !tots
-      done
-     )
-  );
-  for n = 0 to 3 do 
-    let totd = ref 0.0 in
-      (for m = 0 to 3 do
-         totd := !totd +. !dinucleotide.(n).(m)
-       done;
-       for m = 0 to 3 do
-         !dinucleotide.(n).(m) <- !dinucleotide.(n).(m) /. !totd
-       done
-      )
-  done 
-;;
-
-let getwts s =
-  let w = Array.make (String.length s) 0.0 in
-    ( for n = 0 to (String.length s) - 1 do
-        if !correlbg = 0 then w.(n) <- 0.25
-        else 
-          let cn = basenum s.[n] in
-            if cn<0 then w.(n) <- 1.
-            else if n=0 then w.(n) <- !singlesite.(cn)
-            else
-              let cn1 = basenum s.[n-1] in
-                if cn1 < 0 || !correlbg = 1 
-	        then w.(n) <- !singlesite.(cn)
-                else w.(n) <- !dinucleotide.(cn1).(cn)
-      done;
-      w
-    )
-;;
-
-
-let filename2seqlist filenameset gotbgprobs =
-  let fastalist1 =
-    try 
-      readfasta (List.hd filenameset)
-    with 
-	_ -> ( Printf.fprintf stderr 
-                 "Error: unable to read file %s\n" (List.hd filenameset);
-               exit 1)
-  in     
-  let fastalist2l =
-    List.map 
-      (fun filename ->
-         try 
-           readfasta filename
-         with 
-             _ -> ( Printf.fprintf stderr 
-                      "Error: unable to read file %s\n" filename;
-                    exit 1)
-      ) (List.tl filenameset)
-  in
-  let makeseqlist = function
-    | [] -> []
-    | h::t ->
-        List.fold_left (fun n m -> n @ (List.tl m)) (List.tl h) t
-  in
-  let fastalist = fastalist1 @ (makeseqlist fastalist2l) in
-    ( if gotbgprobs then () else
-	( getbgprobs fastalist;
-          normalise_bgprobs () 
-	);
-      List.iter (fun f -> f.abgw <- getwts f.aseq) fastalist;
-      fastalist
-    )
-;;
-
-
-
-let rec blockseq seql seqnumset maxl maxw =
-  (***  the main routine, recursively forms blocks until no more are
-	***  possible *)
-  (***  first, make a list of best pairwise matches, removing matches
-	***  that conflict with better matches; then pairwise-align those, 
-	***  one by one; then call self again.  If null list, quit. *)
-
-  let seqhead = List.hd seql in
-  let seqlist = List.tl seql in
-
-  let find_neigh = function
-    | None -> seqhead 
-    | Frag a -> a
-  in 
-
-  let update_limits seqfrag = (* updates limits for consistent alignment *)
-    let rec sweep_limits frag l_stop r_start m =
-      let lim = seqfrag.limits in
-        if frag==seqhead then () 
-        else
-          let nextfrag = find_neigh (Hashtbl.find frag.neighbours (m,'r')) in
-          let thislabel = Hashtbl.find frag.seqlabel m in
-          let fix_limits n =
-            ( ( if thislabel <= l_stop then
-                  let newlim = 
-		    try Hashtbl.find lim (n,'r') 
-		    with _ -> "0.3"
-		  in
-		  let oldlim = 
-		    try Hashtbl.find frag.limits (n,'r')
-		    with _ -> "0.3"
-		  in 
-                    if oldlim > newlim
-                    then Hashtbl.replace frag.limits (n,'r') newlim);
-              ( if thislabel >= r_start then
-                  let newlim = 
-		    try Hashtbl.find lim (n,'l') 
-		    with _ -> "0"
-		  in
-		  let oldlim =
-		    try Hashtbl.find frag.limits (n,'l')
-		    with _ -> "0"
-		  in 
-                    if oldlim < newlim
-                    then Hashtbl.replace frag.limits (n,'l') newlim))
-          in 
-            (IntSet.iter fix_limits seqnumset;
-             sweep_limits nextfrag l_stop r_start m)
-
-    in (
-        IntSet.iter  (* Set "diagonal" limits to self *)
-          (fun n -> 
-             (Hashtbl.replace seqfrag.limits (n,'l') 
-		(Hashtbl.find seqfrag.seqlabel n) ;
-              Hashtbl.replace seqfrag.limits (n,'r') 
-		(Hashtbl.find seqfrag.seqlabel  n)))
-          seqfrag.seqindex;
-        IntSet.iter (*   replace each limit with most stringent limit of
-                     ***  a neighbouring fragment *)
-          (fun n -> 
-             ( IntSet.iter 
-                 (fun m ->
-                    (let fragl = find_neigh 
-		       (Hashtbl.find seqfrag.neighbours (m,'l')) in
-                     let limn = 
-		       try Hashtbl.find fragl.limits (n,'l') 
-		       with _ -> "0"
-		     in
-		     let seqfraglim = 
-		       try Hashtbl.find seqfrag.limits (n,'l')
-		       with _ -> "0"
-		     in 
-                       if limn > seqfraglim
-                       then Hashtbl.replace seqfrag.limits (n,'l') limn );
-                    (let fragr = find_neigh 
-		       (Hashtbl.find seqfrag.neighbours (m,'r')) in
-                     let limn = 
-		       try Hashtbl.find fragr.limits (n,'r') 
-		       with _ -> "0.3" 
-		     in
-		     let seqfraglim = 
-		       try Hashtbl.find seqfrag.limits (n,'r') 
-		       with _ -> "0.3"
-		     in 
-                       if limn < seqfraglim
-                       then Hashtbl.replace seqfrag.limits (n,'r') limn )
-                 ) seqfrag.seqindex )
-          ) seqnumset;
-        let do_sweep n =
-          let initfrag = (List.nth seqlist n) in
-	  let llimit =
-	    try Hashtbl.find seqfrag.limits (n,'l')
-	    with _ -> "0"
-	  in 
-	  let rlimit = 
-	    try Hashtbl.find seqfrag.limits (n,'r')
-	    with _ -> "0.3"
-	  in 
-            sweep_limits initfrag llimit rlimit n
-        in 
-          IntSet.iter do_sweep seqnumset
-      )
-  in 
-
-  let inconsistent_frags frag1 frag2  =
-    let rec inconsistent_indices indexlist1 indexlist2 =
-      (***  check that the frags don't lie outside allowed ranges for
-	    ***  one another *)
-      let rec inconsistent_indices_onepair index1 indexlist2 =
-        match indexlist2 with
-            [] -> false
-          | h::t -> 
-	      let lim2l =
-		try Hashtbl.find frag2.limits (index1,'l') 
-		with _ -> "0"
-	      in 
-	      let lim2r =
-		try Hashtbl.find frag2.limits (index1,'r') 
-		with _ -> "0.3"
-	      in 
-	      let lim1l = 
-		try Hashtbl.find frag1.limits (h,'l')
-		with _ -> "0"
-	      in 
-	      let lim1r =
-		try Hashtbl.find frag1.limits (h,'r')
-		with _ -> "0.3"
-	      in 
-                if (compare (Hashtbl.find frag1.seqlabel index1) lim2l <= 0) 
-                  || (compare (Hashtbl.find frag1.seqlabel index1) lim2r >= 0)
-                  || (compare (Hashtbl.find frag2.seqlabel h ) lim1l <= 0)
-                  || (compare (Hashtbl.find frag2.seqlabel h) lim1r >= 0)
-                then true
-                else inconsistent_indices_onepair index1 t
-      in 
-        match indexlist1 with
-            [] -> false
-          | h::t -> 
-              if inconsistent_indices_onepair h indexlist2
-              then true
-              else inconsistent_indices t indexlist2
-    in inconsistent_indices (IntSet.elements frag1.seqindex) 
-         (IntSet.elements frag2.seqindex) 
-  in 
-
-
-  let rec make_block_list fraglist outlist =
-    (***  make a list of best matches, to be traversed by fuse_block_list *)
-    (***  note: at least one of the frags being paired must have 
-	  ***  "paired"=true *)
-    let rec one_block frag rest_of_list outlist = 
-      match rest_of_list with
-          [] -> outlist
-        | a::t -> 
-	    if 
-	      (frag.aseq != "") && (a.aseq != "")	      
-            then 
-	      let n1,n2,l,p = 
-		best_match frag.aseq a.aseq frag.abgw a.abgw
-	      in
-                if p<1.0 then
-		  let s1 = IntSet.elements frag.seqindex in
-		  let s2 = IntSet.elements a.seqindex in
-		    one_block frag t ((p,frag,n1,s1,a,n2,s2,l)::outlist)
-                else 
-                  one_block frag t outlist
-	    else 
-	      one_block frag t outlist
-    in 
-      match fraglist with
-          [] -> outlist
-        | a::t -> 
-	    let t1 = 
-	      List.filter (
-		fun f -> not (inconsistent_frags a f)) t in
-	      make_block_list t (one_block a t1 outlist)
-  in 
-
-
-  let fuse_block_list blocklist =
-
-    let rec firstfrag (p,frag1,n1,s1,frag2,n2,s2,l) =
-      if n1 >= (String.length frag1.aseq) 
-      then
-        let frag1b= find_neigh (Hashtbl.find frag1.neighbours (List.hd s1,'r'))
-        in
-          firstfrag (p,frag1b,n1-(String.length frag1.aseq), s1,
-                     frag2,n2,s2,l)
-      else if n2 >= (String.length frag2.aseq)
-      then
-        let frag2b= find_neigh (Hashtbl.find frag2.neighbours (List.hd s2,'r'))
-        in
-          firstfrag (p,frag1,n1,s1,
-                     frag2b,n2-(String.length frag2.aseq), s2,l)
-      else (p,frag1,n1,s1,frag2,n2,s2,l)
-    in 
-
-    let nextfrag (p,frag1,n1,s1,frag2,n2,s2,l) =
-      let f1gap = ((String.length frag1.aseq) - n1) in
-      let f2gap = ((String.length frag2.aseq) - n2) in
-	if (f1gap < f2gap) then
-	  let frag1b = 
-	    find_neigh (Hashtbl.find frag1.neighbours (List.hd s1, 'r')) in
-	  let lb = l - f1gap in
-	    (p,frag1,n1,s1,frag2,n2,s2,f1gap), 
-	    (p,frag1b,0,s1,frag2,n2+f1gap,s2,lb)
-	else if (f1gap > f2gap) then 
-	  let frag2b =
-	    find_neigh (Hashtbl.find frag2.neighbours (List.hd s2, 'r')) in
-	  let lb = l - f2gap in
-	    (p,frag1,n1,s1,frag2,n2,s2,f2gap), 
-	    (p,frag1,n1+f2gap,s1,frag2b,0,s2,lb)
-	else
-	  let frag1b = 
-	    find_neigh (Hashtbl.find frag1.neighbours (List.hd s1, 'r')) in
-	  let frag2b = 
-	    find_neigh (Hashtbl.find frag2.neighbours (List.hd s2, 'r')) in
-	  let lb = l - f1gap in
-            (p,frag1,n1,s1,frag2,n2,s2,f1gap), 
-            (p,frag1b,0,s1,frag2b,0,s2,lb)
-    in 
-
-    let rec expand_block pair1 outlist =
-      let (p,frag1,n1,s1,frag2,n2,s2,l) = pair1 in
-        (***  if the block, as annotated, is no longer in one fragment because
-              ***  of other fusings, then convert this block into a list of
-              ***  single-fragment blocks *)
-        if inconsistent_frags frag1 frag2
-        then []
-        else
-          if String.length (frag1.aseq) > n1+l-1 &&
-            String.length (frag2.aseq) > n2+l-1 
-          then (pair1::outlist)
-          else 
-            if frag1=seqhead || frag2=seqhead then
-              (print_endline "problem";
-               outlist)
-            else
-              let pair1,pair2 = nextfrag pair1 in
-                expand_block pair2 (pair1::outlist)
-    in 
-
-    let fuse_one_block (p,frag1,n1,s1,frag2,n2,s2,l) =
-      let consensus s1 s2 w1 w2 =
-        for i = 0 to (String.length s1) -1 do
-          if s1.[i] != s2.[i] then ( s1.[i] <- 'N'; w1.(i) <- 0.)
-        done;
-        s1, w1
-      in 
-	(***  fuse the two frags; it is assumed that expand_block has
-              ***  already been called so n1+l and n2+l won't exceed the length
-              ***  of the frag *)
-	(***  divide frag1 and frag2 into frag1, frag2, frag3, frag4, frag5
-              ***  where frag3 is the "shared piece" so the original frags are
-              ***  split into frag1->frag3->frag4 and frag2->frag3->frag5 *)
-
-	(
-          let tempstr1=String.copy frag1.aseq in
-          let tempstr2=String.copy frag2.aseq in 
-	  let tempbgw1=Array.copy frag1.abgw in
-	  let tempbgw2=Array.copy frag2.abgw in 
-	    (
-              frag1.aseq <- String.sub tempstr1 0 n1 ;
-              frag2.aseq <- String.sub tempstr2 0 n2 ;
-	      frag1.abgw <- Array.sub tempbgw1 0 n1;
-	      frag2.abgw <- Array.sub tempbgw2 0 n2;
-              let frag3 = 
-		let as3, aw3 = 
-		  consensus (String.sub tempstr1 n1 l) 
-		    (String.sub tempstr2 n2 l) (Array.sub tempbgw1 n1 l) 
-		    (Array.sub tempbgw2 n2 l)
-		in 
-		  { header = ""; 
-                    aseq = as3;
-		    abgw = aw3;
-		    seq = Hashtbl.copy frag1.seq;
-                    seqindex=IntSet.union frag1.seqindex frag2.seqindex;
-                    seqlabel= Hashtbl.copy frag1.seqlabel;
-                    neighbours=Hashtbl.copy frag1.neighbours;
-                    limits = Hashtbl.copy frag1.limits
-                  } in 
-              let frag4 = { header = ""; 
-                            aseq = String.sub tempstr1 (n1+l) ((String.length tempstr1)-n1-l);
-			    abgw = Array.sub tempbgw1 (n1+l) ((String.length tempstr1)-n1-l);
-			    seq = Hashtbl.copy frag1.seq;
-                            seqindex= IntSet.union frag1.seqindex IntSet.empty;
-                            seqlabel= Hashtbl.copy frag1.seqlabel;
-                            neighbours= Hashtbl.copy frag1.neighbours;
-                            limits = Hashtbl.copy frag1.limits
-                          } in 
-              let frag5 = { header = ""; 
-                            aseq = String.sub tempstr2 (n2+l) ((String.length tempstr2)-n2-l);
-                            abgw = Array.sub tempbgw2 (n2+l) ((String.length tempstr2)-n2-l);
-			    seq = Hashtbl.copy frag2.seq;
-                            seqindex=IntSet.union frag2.seqindex IntSet.empty;
-                            seqlabel= Hashtbl.copy frag2.seqlabel;
-                            neighbours= Hashtbl.copy frag2.neighbours;
-                            limits = Hashtbl.copy frag2.limits
-                          } 
-              in (
-                  IntSet.iter 
-                    (fun n -> (Hashtbl.replace frag3.neighbours (n,'l') 
-				 (Frag frag1);
-                               Hashtbl.replace frag3.neighbours (n,'r') 
-				 (Frag frag4);
-                               Hashtbl.replace frag1.neighbours (n,'r') 
-				 (Frag frag3);
-                               Hashtbl.replace frag4.neighbours (n,'l') 
-				 (Frag frag3);
-
-			       Hashtbl.replace frag3.seq n
-				 (String.sub (Hashtbl.find frag1.seq n) n1 l);
-			       Hashtbl.replace frag4.seq n
-				 (String.sub (Hashtbl.find frag1.seq n) 
-				    (n1+l) ((String.length tempstr1)-n1-l));
-			       Hashtbl.replace frag1.seq n
-				 (String.sub (Hashtbl.find frag1.seq n) 0 n1);
-
-                               Hashtbl.replace frag3.seqlabel n 
-				 ((Hashtbl.find frag1.seqlabel n) ^ "1");
-                               Hashtbl.replace frag4.seqlabel n  
-				 ((Hashtbl.find frag4.seqlabel n) ^ "2");
-                               Hashtbl.replace frag1.seqlabel n  
-				 ((Hashtbl.find frag1.seqlabel n) ^ "0");
-                              )) frag1.seqindex;
-                  IntSet.iter 
-                    (fun n -> (Hashtbl.replace frag3.neighbours (n,'l') 
-				 (Frag frag2);
-                               Hashtbl.replace frag3.neighbours (n,'r') 
-				 (Frag frag5);
-                               Hashtbl.replace frag2.neighbours (n,'r') 
-				 (Frag frag3);
-                               Hashtbl.replace frag5.neighbours (n,'l') 
-				 (Frag frag3);
-
-			       Hashtbl.replace frag3.seq n
-				 (String.sub (Hashtbl.find frag2.seq n) n2 l);
-			       (try
-				  Hashtbl.replace frag5.seq n
-				    (String.sub (Hashtbl.find frag2.seq n) 
-				       (n2+l) ((String.length tempstr2)-n2-l))
-				with
-				    _ -> 
-				      print_endline (Hashtbl.find frag2.seq n)
-			       ); 
-			       Hashtbl.replace frag2.seq n
-				 (String.sub (Hashtbl.find frag2.seq n) 0 n2);
-
-                               Hashtbl.replace frag3.seqlabel n 
-				 ((Hashtbl.find frag2.seqlabel n) ^ "1");
-                               Hashtbl.replace frag5.seqlabel n 
-				 ((Hashtbl.find frag5.seqlabel n) ^ "2");
-                               Hashtbl.replace frag2.seqlabel n 
-				 ((Hashtbl.find frag2.seqlabel n) ^ "0")
-                              )) frag2.seqindex;
-
-                  seqset := SeqSet.add frag3 !seqset;
-                  seqset := SeqSet.add frag4 !seqset;
-                  seqset := SeqSet.add frag5 !seqset;
-                  update_limits frag3;
-		)
-            )
-	)
-    in 
-
-    let fuse_one_frag f = 
-      let expanded = expand_block (firstfrag f) [] in
-        if (List.for_all 
-              (fun (p,frag1,n1,s1,frag2,n2,s2,l) ->
-                 not (inconsistent_frags frag1 frag2))
-              expanded) 
-        then 
-          List.iter fuse_one_block expanded
-    in 
-      List.iter fuse_one_frag blocklist
-  in 
-  let fraglist = SeqSet.elements !seqset in
-  let blocklist =List.sort compare (make_block_list fraglist []) in
-    match blocklist with 
-        [] -> []
-      | _ -> (fuse_block_list blocklist;
-	      let (_,_,_,_,_,_,_,maxl) = List.hd blocklist in
-		blockseq seql seqnumset maxl maxw)
-;;
-
-(***                                                              ***)
-(***  Create aligned sequence out of the bunch of fragments       ***)
-(***  produced by blockseq                                        ***)
-(***                                                              ***)
-
-
-let rec assemble_frags_to_seqs seqhead seqlist foutlist  =
-
-  let is_complete frag = (*   if frag is multi-sequence, all copies
-                          *** are at head of seqlist *)
-    if frag == seqhead then false else
-    if IntSet.cardinal frag.seqindex = 1 then true else
-      let nfragmatches =
-        List.fold_left 
-          (fun n fr -> 
-	     if IntSet.compare frag.seqindex fr.seqindex = 0
-             then n+1 else n) 0 seqlist
-      in 
-        if nfragmatches = IntSet.cardinal frag.seqindex
-        then true else false
-  in 
-
-  let no_other_blocks frag = (*   No other complete frag has a smaller
-                              *** seqindex in seqlist *)
-    if IntSet.cardinal frag.seqindex = 1 then true else
-      List.fold_left 
-        (fun b fr -> 
-           if (IntSet.cardinal fr.seqindex > 0) &&
-             (IntSet.compare fr.seqindex frag.seqindex = -1) &&
-             is_complete fr
-           then false else b) true seqlist
-  in 
-
-  let maxlen fragwidth seqindex = (*    For a complete multi-sequence
-                                   ***  frag, it must be aligned when
-                                   ***  tacked on, and can't overlap an
-                                   ***  existing sequence; so find
-                                   ***  "padding" length *)
-    let maxlen_blockseqs =
-      IntSet.fold (fun idx n ->
-                     let fout = (List.nth foutlist idx) in
-                       if Buffer.length fout.fseq > n
-                       then Buffer.length fout.fseq else n) seqindex 0
-    in 
-    let push_up fs m =
-      let m1 = ref m in
-        (while 
-           (fs.[!m1] =Char.lowercase fs.[!m1]) do
-           m1 := !m1 + 1
-         done; 
-         while (!m1 < String.length fs)
-           &&(fs.[!m1] <> Char.lowercase fs.[!m1]) do
-           m1 := !m1 + 1
-         done; 
-         !m1)
-    in 
-    let rec overlap fs m1 n =
-      if n = fragwidth then false
-      else if (m1+n) >= String.length fs then false
-      else if fs.[m1+n] <> Char.lowercase fs.[m1+n] then true
-      else overlap fs m1 (n+1)
-    in 
-    let mx = ref maxlen_blockseqs in
-      (  for n = 0 to (List.length foutlist - 1) do
-         let fs = Buffer.contents (List.nth foutlist n).fseq in
-           (while overlap fs !mx 0 do
-              mx := push_up fs !mx
-            done ) 
-       done;
-       !mx )
-  in 
-
-  let find_neigh neigh1 =
-    match neigh1 with
-        None -> seqhead 
-      | Frag a -> a
-  in 
-
-  let rec process_frag_list seql foutl newseqlist n oldmaxlen =
-    match seql with
-        [] -> List.rev newseqlist
-      | frag::seqt ->
-	  let fragseq = Hashtbl.find frag.seq n in
-          let fout = List.hd foutl in
-            if (is_complete frag) && (no_other_blocks frag) then
-              ( if IntSet.cardinal frag.seqindex = 1 then
-                  (Buffer.add_string fout.fseq fragseq;
-                   let nextfrag = find_neigh 
-		     (Hashtbl.find frag.neighbours (n,'r')) in
-                     process_frag_list seqt (List.tl foutl) 
-                       (nextfrag::newseqlist) (n+1) oldmaxlen
-                  )
-                else (
-                  if oldmaxlen = 0 && 
-                    (Buffer.length fout.fseq > 0 || 
-                     n = List.hd (IntSet.elements frag.seqindex)) then (
-                    (*** Find padding value, unless we're just starting ***)
-                    (*** If just starting, make sure this is the first  ***)
-                    (*** in the block                                   ***)
-                    let newmaxlen = (maxlen (String.length fragseq) 
-                                       frag.seqindex) in
-                      while Buffer.length fout.fseq < newmaxlen do
-                        Buffer.add_char fout.fseq '-'
-                      done
-                  ) else
-                    while Buffer.length fout.fseq < oldmaxlen do
-                      Buffer.add_char fout.fseq '-'
-                    done;
-                  Buffer.add_string fout.fseq (String.uppercase fragseq);
-                  let nextfrag 
-		      = find_neigh (Hashtbl.find frag.neighbours (n,'r'))
-                  in 
-                    process_frag_list seqt (List.tl foutl) 
-                      (nextfrag::newseqlist) (n+1)
-                      ((Buffer.length fout.fseq) - (String.length fragseq))
-                )
-              )
-            else 
-              process_frag_list seqt (List.tl foutl)
-                (frag::newseqlist) (n+1) oldmaxlen
-  in 
-  let newseqlist = process_frag_list seqlist foutlist [] 0 0
-  in 
-  let end_list =
-    List.fold_left 
-      (fun b fr -> if fr==seqhead then b else false) true newseqlist
-  in
-    if end_list
-    then foutlist
-    else 
-      assemble_frags_to_seqs seqhead newseqlist foutlist
-;;
-
-(***                                                              ***)
-(***                                                              ***)
-(***                                                              ***)
-
-let print_aligned foutlist = 
-  let print_one_line foutlist n poslist =
-    let print_one_line_seq fout pos =
-      let newpos = ref pos in
-        for m = 0 to 49 do 
-          ( ( if m mod 50 = 0 then
-                let head = 
-                    if String.length fout.fheader < 11 
-                    then String.sub fout.fheader 1 
-                      (String.length fout.fheader -1)
-                    else String.sub fout.fheader 1 10 
-                in 
-                  ( print_string head;
-                    (if (String.length head < 10) then
-                       print_string 
-                         (String.make (10-(String.length head)) ' '));
-                    print_string "  "
-                  )
-              else if ((m) mod 10) = 0 then 
-                print_string " "
-            );
-            (let ch = 
-               if n+m < Buffer.length fout.fseq
-               then (Buffer.contents fout.fseq).[m+n] 
-               else ' ' in
-               (print_char ch;
-                if ch = '-' or ch = ' ' then
-                  ()
-                else 
-                  newpos := !newpos+1
-               )
-            );
-            if ((m+1) mod 50) = 0 then (
-              print_string "  ";
-              print_int !newpos;
-              print_string "\n";
-            )) 
-        done;
-        !newpos
-    in 
-      List.map2 print_one_line_seq foutlist poslist
-  in              
-    
-  let rec print_aligned foutlist n poslist =
-    (print_endline "";
-     if (List.fold_left (fun b f -> 
-                           if n < (Buffer.length f.fseq) then true else b) 
-           false foutlist) then 
-       print_aligned foutlist (n+50) (print_one_line foutlist n poslist)
-     else () 
-    )
-  in 
-  let poslist = List.map (fun f -> 0) foutlist
-  in print_aligned foutlist 0 poslist;;
-
-(***                                                              ***)
-(***                                                              ***)
-(***                                                              ***)
-
-let print_fasta foutlist =
-  List.iter 
-    (fun f ->
-       print_endline f.fheader;
-       print_endline (Buffer.contents f.fseq);
-       print_string "\n"
-    ) foutlist ;;
-
-(***                                                              ***)
-(***                                                              ***)
-(***                                                              ***)
-(***                                                              ***)
-
-(***  main program below *)
-
-let filenameset = ref [] ;;
-let outtype = ref "";;
-
-let argspeclist = 
-  [("-A", Arg.Unit 
-      (fun () -> outtype :=  !outtype^"A"), 
-      "aligned output (compare -F option) (default: only this)");
-   ("-b", Arg.Set_string auxfilename, "name of auxiliary file from which "
-      ^ "to read background sequences (overridden by -B)");
-   ("-B", Arg.Set_string bgfilename, "name of file containing background probabilities");
-   ("-F", Arg.Unit 
-      (fun () -> outtype := !outtype^"F"), 
-      "fasta output (can use both -A and -F in either order)");
-   ("-n", Arg.Set_int correlbg, "background correlation (default 2="
-      ^ "dinucleotide;\n     1=single-site basecounts, 0=0.25 per base)");
-   ("-x", Arg.Set_float prand, "set limit for how probable the match "
-      ^ "is by chance\n     (default 0.002, smaller=more stringent)")]
-in 
-let message = 
-  String.concat ""
-    ["Sigma, version 1.0: simple greedy multiple alignment\n";
-     "Copyright (C) 2006 Rahul Siddharthan <rsidd at imsc.res.in>\n";
-     "Licensed under the GNU General Public License, version 2\n\n";
-     "Usage: sigma [options] inputfile.fasta [inputfile2.fasta ...]\n\n";
-     "Each fasta file may contain a single sequence or multiple sequences;\n";
-     "all sequences will be aligned together.\n\n";
-     "Options:"]
-in                 
-  Arg.parse argspeclist (fun s -> filenameset := !filenameset @ [s]) message;;
-
-
-if !outtype = "" then outtype := "A";;
-
-
-if !filenameset = [] then
-  (Printf.fprintf stderr "Must specify name of file(s) in fasta format (or use -help)\n";
-   exit 1);;
-
-
-if !correlbg > 2 then 
-  (print_endline "-n > 2 not implemented; using 2";
-   correlbg := 2)
-else if !correlbg < 0 then
-  (print_endline "-n should be >= 0; assuming 0";
-   correlbg := 0)
-;;
-
-let gotbgprobs = ref (!correlbg = 0);;
-
-if !bgfilename > "" then
-  try 
-    ( getbgprobs_from_bgfile !bgfilename;
-      normalise_bgprobs ();
-      gotbgprobs := true 
-    )
-  with _ -> 
-    Printf.fprintf stderr "Couldn't read bgprobs from file %s\n" !bgfilename
-;;
-
-if (not !gotbgprobs) then (
-  let auxseqlist = 
-    match !auxfilename with
-	"" -> []
-      | _ -> 
-          try readfasta !auxfilename with
-              _ -> (Printf.fprintf stderr 
-		      "Couldn't open file %s : using input sequences\n" 
-		      !auxfilename;
-                    [])
-  in 
-    if auxseqlist != [] then
-      ( getbgprobs auxseqlist;
-        normalise_bgprobs () ;
-	gotbgprobs := true
-      )
-)
-;;
-
-let seqlist = filename2seqlist !filenameset !gotbgprobs ;;
-
-match List.length seqlist with
-    1 -> (Printf.fprintf stderr "Error: no sequences read\n"; exit 1)
-  | 2 -> (Printf.fprintf stderr 
-            "Only one sequence read, no alignment needed\n"; exit 0)
-  | _ -> ()
-;;
-
-let nel=(List.length seqlist) - 2 in
-  for n = 0 to nel do
-    let h = List.nth seqlist (n+1) in
-      ( h.seqindex <- IntSet.add n IntSet.empty ;
-        Hashtbl.replace h.neighbours (n,'l') (Frag (List.hd seqlist));
-        Hashtbl.replace h.neighbours (n,'r') (Frag (List.hd seqlist));
-        Hashtbl.replace h.seqlabel n "0.";
-        Hashtbl.replace (List.hd seqlist).seqlabel n "-1";
-        Hashtbl.replace h.seq n (String.copy h.aseq);
-	(* h.abgw <- getwts h.aseq; *)
-	(* h.abgw <- Array.create (String.length h.aseq) 0.0; *)
-        Hashtbl.replace (List.hd seqlist).seq n "";
-      )
-  done;;
-
-seqset := makefastaset seqlist;
-
-let seqnumset =
-  let rec range n r = 
-    if n<0 then r else range (n-1) (IntSet.add n r)
-  in range (List.length seqlist - 2) IntSet.empty
-in blockseq seqlist seqnumset 10000 0.;;
-
-let fout = List.map (fun fr -> {fheader=fr.header; 
-                                fseq = Buffer.create 1000} ) 
-             (List.tl seqlist) ;;
-
-assemble_frags_to_seqs (List.hd seqlist) (List.tl seqlist) fout ;;
-
-String.iter (fun c -> if c='A' then print_aligned fout else
-               print_fasta fout) !outtype;;




More information about the debian-med-commit mailing list