[med-svn] r105 - in trunk/packages/sigma-align/trunk: . debian
Charles Plessy
charles-guest at costa.debian.org
Fri Sep 8 03:56:07 UTC 2006
Author: charles-guest
Date: 2006-09-08 03:56:06 +0000 (Fri, 08 Sep 2006)
New Revision: 105
Added:
trunk/packages/sigma-align/trunk/sigma.ml
Modified:
trunk/packages/sigma-align/trunk/debian/control
trunk/packages/sigma-align/trunk/debian/rules
Log:
conforming to the ocaml policy
Modified: trunk/packages/sigma-align/trunk/debian/control
===================================================================
--- trunk/packages/sigma-align/trunk/debian/control 2006-09-05 15:40:25 UTC (rev 104)
+++ trunk/packages/sigma-align/trunk/debian/control 2006-09-08 03:56:06 UTC (rev 105)
@@ -3,12 +3,12 @@
Priority: optional
Maintainer: Debian-Med Packaging Team <debian-med-packaging at lists.alioth.debian.org>
Uploaders: Charles Plessy <charles-debian-nospam at plessy.org>
-Build-Depends: debhelper (>= 5), ocaml-nox, xsltproc, docbook-xsl, docbook-xml
+Build-Depends: debhelper (>= 5), ocaml-best-compilers, xsltproc, docbook-xsl, docbook-xml
Standards-Version: 3.7.2
Package: sigma-align
Architecture: any
-Depends: ${shlibs:Depends}, ${misc:Depends}
+Depends: ocaml-base-nox-${F:OCamlABI}, ${shlibs:Depends}, ${misc:Depends}
Description: Simple greedy multiple alignment of non-coding DNA sequences
Sigma ("Simple greedy multiple alignment") is an alignment program with a new
algorithm and scoring scheme designed specifically for non-coding DNA
Modified: trunk/packages/sigma-align/trunk/debian/rules
===================================================================
--- trunk/packages/sigma-align/trunk/debian/rules 2006-09-05 15:40:25 UTC (rev 104)
+++ trunk/packages/sigma-align/trunk/debian/rules 2006-09-08 03:56:06 UTC (rev 105)
@@ -13,7 +13,11 @@
build:
dh_testdir
- ocamlopt str.cmxa sigma.ml -o sigma
+ if [ -x /usr/bin/ocamlopt ]; then \
+ ocamlopt str.cmxa sigma.ml -o sigma ; \
+ else \
+ ocamlc str.cma sigma.ml -o sigma ; \
+ fi
xsltproc -o debian/ -''-nonet /usr/share/xml/docbook/stylesheet/nwalsh/manpages/docbook.xsl debian/sigma.1.xml
touch build
@@ -34,18 +38,16 @@
binary-arch: build install
dh_testdir
dh_testroot
- dh_installchangelogs
- dh_installdocs
- dh_installexamples
+ dh_installdocs sigma.ml
dh_install sigma usr/bin
dh_installman debian/sigma.1
dh_link
- dh_strip
+# dh_strip
dh_compress
dh_fixperms
dh_installdeb
dh_shlibdeps
- dh_gencontrol
+ dh_gencontrol -s -- -VF:OCamlABI="$(OCAMLABI)"
dh_md5sums
dh_builddeb
Added: trunk/packages/sigma-align/trunk/sigma.ml
===================================================================
--- trunk/packages/sigma-align/trunk/sigma.ml (rev 0)
+++ trunk/packages/sigma-align/trunk/sigma.ml 2006-09-08 03:56:06 UTC (rev 105)
@@ -0,0 +1,1071 @@
+(*** SiGMA : Simple Greedy Multiple Alignment
+ *** Version 1.0 (March 11, 2006)
+ *** Copyright (C) Rahul Siddharthan, 2006
+ ***
+ *** Requires ocaml 3.x (http://caml.inria.fr)
+ *** When compiling, link the str module as follows:
+ *** bytecode compiler -> ocamlc str.cma sigma.ml -o sigma.bytecode
+ *** native-code compiler -> ocamlopt str.cmxa sigma.ml -o sigma
+ ***
+ *** (native is around 7x-10x faster, but no debugger support)
+ ***
+ *** To produce a static binary on linux (so that it's independent of
+ *** your machine's particular version of libc), use
+ *** ocamlopt -cc "gcc -static" str.cmxa sigma.ml -o sigma
+ ***)
+
+(*** Strategy: find most significant common substring between two strings
+ *** (mutations may exist but are taken care of in the scoring)
+ *** and "align" them. Then next largest common substring among
+ *** existing fragments that may be "consistently" aligned. Each time
+ *** a common substring is found it is "fused" into one fragment (with
+ *** multiple associated sequence numbers). Repeat until nothing left.
+ ***
+ *** Actually, for reasons of efficiency a list of "best matches" is made
+ *** and aligned with consistency checks, rather than doing it one
+ *** match at a time, but that's a detail.
+ ***)
+
+module IntSet = Set.Make (struct type t=int
+ let compare=compare end )
+
+(*** the main data type, the sequence fragment ***)
+type neigh_var = None | Frag of seq_frag
+and seq_frag = { header : string ;
+ seq : (int, string) Hashtbl.t ;
+ mutable aseq: string;
+ mutable abgw: float array;
+ mutable seqindex : IntSet.t;
+ seqlabel: (int, string) Hashtbl.t;
+ neighbours: (int * char, neigh_var) Hashtbl.t;
+ limits : (int * char, string) Hashtbl.t }
+
+(*** Notes:
+ ***
+ *** 1. seq is a hashtbl keyed by sequence index n, giving corresponding
+ *** sequence, since after alignment there may be mismatches.
+ ***
+ *** 2. aseq is a string giving the consensus of all aligned strings.
+ *** For now, a mismatch will be marked as N and subsequent bases
+ *** aligned to it will get a score zero.
+ ***
+ *** 3. seqindex is the original sequence to which the current sequence
+ *** fragment belongs; a set containing initially a single number, but,
+ *** as pairs are made, possibly multiple numbers
+ ***
+ *** 4. limits is a hashtbl keyed with pairs of sequence numbers and chars
+ *** (n,c='l'/'r') mapping to the legal start (l) or end (r) position on
+ *** sequence n for this fragment.
+ ***
+ *** 5. seqlabel is an array whose nth element is the label of that
+ *** fragment on sequence n, a string representation of a float between 0
+ *** and 1 (actually between 0 and 0.3). Each fragment on each sequence
+ *** will have a unique label.
+ ***
+ *** 6. neighbours is a hashtbl that takes as keys a pair of the
+ *** sequence number and a char 'l'/'r' and outputs a ref to the
+ *** neighbouring frag on that seq in that direction
+ ***
+ *** 7. abgw is the background weight, the log of the (conditional)
+ *** background probability for that base divided by 0.25.
+ ***
+ *** The sequence starts in two datastructures, a list (whose first
+ *** element is a dummy) and a set that contains all the "real" elements.
+ *** At each alignment step two sequences (elements of the set) are
+ *** broken up into 5 sequences (a shared portion and the non-shared
+ *** boundaries, which may be zero-length). The original list always
+ *** contains the starting element, with pointer to the next fragment.
+ *** It's like a doubly-linked list (actually, a set of lists with
+ *** crosslinks, if you like) with end fragments pointing to a
+ *** "dummy" fragment.
+ ***)
+
+type fasta_seq = { fheader : string; mutable fseq: Buffer.t}
+
+(*** Global variables ***)
+let prand = ref 0.002
+let correlbg = ref 2
+let bgfilename = ref ""
+let auxfilename = ref ""
+let singlesite = ref (Array.make 4 0.0)
+let dinucleotide = ref (Array.make_matrix 4 4 0.0)
+
+(*** ***)
+(*** ***)
+(*** ***)
+
+(*** best_match: finds the most significant common substring with
+ *** mismatches. The significance is basically the background
+ *** probability of the string multiplied by the probability of
+ *** seeing a match of that quality in two long strings of the given
+ *** lengths. The probability for fragment size l with string lengths
+ *** l1 and l2 is 1-(1-p)^[(l1-l+1)(l2-l+1)] = approx. p*(l1-l+1)*(l2-l+1)
+ *** where p = bg prob * (l choose m), where m = number of mismatches.
+ *** Lowest answer = most significant.
+ ***)
+
+let best_match s1 s2 w1 w2 =
+ let sl1 = String.length s1 in
+ let sl2 = String.length s2 in
+ let fmat = Array.make_matrix (sl1 + 1) (sl2 + 1) 1.0 in
+ let lmat = Array.make_matrix (sl1 + 1) (sl2 + 1) 0 in
+ let mmat = Array.make_matrix (sl1 + 1) (sl2 + 1) 0 in
+ let best = ref 1.0 in
+ let bestm = ref 0 in
+ let bestn = ref 0 in
+ let lbest = ref 0 in
+ for m = 1 to sl1 do
+ for n = 1 to sl2 do
+ let a1 = s1.[m-1] in
+ let b1 = s2.[n-1] in
+ let f_old = fmat.(m-1).(n-1) in
+ let m_old = mmat.(m-1).(n-1) in
+ let l_new = lmat.(m-1).(n-1) + 1 in
+ let m_new, f_new =
+ if (a1=b1) && (a1 != 'N')
+ then
+ m_old, f_old *. sqrt(w1.(m-1)*.w2.(n-1))
+ *. (float l_new) /. (float (l_new-m_old))
+ else m_old + 1, f_old *. (float l_new) /. (float (m_old + 1))
+ in
+ let (f_new, l_new, m_new) = (min (f_new,l_new,m_new) (1.0,0,0)) in
+ ( fmat.(m).(n) <- f_new;
+ lmat.(m).(n) <- l_new;
+ mmat.(m).(n) <- m_new;
+ if (f_new < !best) then
+ ( best := f_new; bestm := m; bestn := n)
+ )
+ done
+ done;
+ while (fmat.(!bestm - !lbest).(!bestn - !lbest) < 1.0)
+ && (!lbest < !bestm) && (!lbest < !bestn) do
+ lbest := !lbest + 1
+ done;
+ best := !best *. (float (sl1 - !lbest+1)) *. (float (sl2 - !lbest+1));
+ if (!best < !prand)
+ then !bestm - !lbest, !bestn - !lbest, !lbest, !best
+ else 0,0,0,1.0
+;;
+
+(*** ***)
+(*** ***)
+(*** ***)
+
+let compareSeq seq1 seq2 =
+ (*** if seqindex is same in both cases then check fragment number
+ *** from "seqlabel", otherwise IntSet.compare *)
+ (*** this is needed to create the module SeqSet, but isn't used,
+ *** except to the extent that this should not return 0 unless
+ *** two frags really are identical, which should not happen *)
+ match (IntSet.compare seq1.seqindex seq2.seqindex ) with
+ 0 -> ( let rec complabel = function
+ | [] -> 0
+ | a::t ->
+ match (compare (Hashtbl.find seq1.seqlabel a)
+ (Hashtbl.find seq2.seqlabel a)) with
+ 0 -> complabel t
+ | n -> n
+ in complabel (IntSet.elements seq1.seqindex))
+ | n -> n ;;
+
+
+module SeqSet = Set.Make (struct type t=seq_frag
+ let compare=compareSeq end );;
+
+let seqset = ref SeqSet.empty ;;
+
+(*** ***)
+(*** ***)
+(*** ***)
+
+let readlines_noblank ffilename = (* works like python's readlines *)
+ let fchan = open_in ffilename in
+ let lenstr = in_channel_length fchan in
+ let res = String.create lenstr in
+ really_input fchan res 0 lenstr ;
+ Str.split (Str.regexp "\\(\r\n\\|\n\\)+" ) res ;;
+
+let newfseq h = { header = h;
+ seq = Hashtbl.create 5;
+ aseq = "" ;
+ abgw = Array.create 0 0.0;
+ seqindex = IntSet.empty;
+ seqlabel = Hashtbl.create 5;
+ neighbours = Hashtbl.create 10;
+ limits = Hashtbl.create 5 };;
+
+let readfasta filename = (* reads a fasta sequence, returns it as a list
+ *** with a dummy first element used as a marker *)
+ let flines = readlines_noblank filename in
+ let rec flines2flist flist flines fcurr ln =
+ if List.length flines = 0 then
+ fcurr::flist
+ else
+ match (List.hd flines).[0] with
+ '>' ->
+ let fcurr2 = newfseq (List.hd flines)
+ in flines2flist (fcurr::flist) (List.tl flines) fcurr2 (ln+1)
+ | ';'|'#' -> flines2flist flist (List.tl flines) fcurr ln
+ | _ ->
+ let fcurr2 = { header = fcurr.header; seq=fcurr.seq;
+ aseq = fcurr.aseq ^ (String.lowercase
+ (List.hd flines)) ;
+ abgw = Array.create 0 0.0;
+ seqlabel= fcurr.seqlabel ;
+ neighbours = fcurr.neighbours;
+ seqindex = fcurr.seqindex;
+ limits = fcurr.limits
+ }
+ in flines2flist flist (List.tl flines) fcurr2 ln
+ in
+ List.rev (flines2flist [] flines (newfseq "" ) (-1));;
+
+
+let makefastaset seqlist =
+ let rec list2set a s =
+ match a with
+ [] -> s
+ | h::t -> list2set t (SeqSet.add h s)
+ in list2set (List.tl seqlist) (SeqSet.empty);;
+
+
+let basenum = function
+ | 'a' | 'A' -> 0
+ | 'c' | 'C' -> 1
+ | 'g' | 'G' -> 2
+ | 't' | 'T' -> 3
+ | _ -> -1
+;;
+
+
+let getbgprobs_from_bgfile bgfile = (* read singlesite and dinucleotide
+ from bgfile *)
+ let bglines = readlines_noblank bgfile in
+ let line2val line1 =
+ let line1s=Str.split (Str.regexp "[ \t]+") line1 in
+ if line1s > [] && line1.[0]!= '#' && (String.length (List.hd line1s) < 3)
+ then
+ let bs = List.hd line1s in
+ if (String.length bs = 1) then
+ !singlesite.(basenum bs.[0]) <-
+ (float_of_string (List.nth line1s 1))
+ else
+ !dinucleotide.(basenum bs.[0]).(basenum bs.[1]) <-
+ (float_of_string (List.nth line1s 1))
+ else ()
+ in
+ List.iter line2val bglines;;
+
+
+let getbgprobs auxseqlist =
+ (*** assume !singlesite, !dinucleotide are initialised to zero ***)
+ List.iter
+ (fun f ->
+ for n = 0 to (String.length f.aseq) - 1 do
+ let c = basenum f.aseq.[n] in
+ if c >= 0 then
+ (!singlesite.(c) <- !singlesite.(c) +. 1.0;
+ !singlesite.(3-c) <- !singlesite.(3-c) +. 1.0;
+ if n>0 then
+ let c1 = basenum f.aseq.[n-1] in
+ if c1 >=0 then
+ (!dinucleotide.(c1).(c) <- !dinucleotide.(c1).(c) +. 1.0;
+ )
+ )
+ done
+ ) auxseqlist
+;;
+
+
+let normalise_bgprobs () =
+ (let tots = ref 0.0 in
+ (for n = 0 to 3 do
+ tots := !tots +. !singlesite.(n)
+ done;
+ for n = 0 to 3 do
+ !singlesite.(n) <- !singlesite.(n) /. !tots
+ done
+ )
+ );
+ for n = 0 to 3 do
+ let totd = ref 0.0 in
+ (for m = 0 to 3 do
+ totd := !totd +. !dinucleotide.(n).(m)
+ done;
+ for m = 0 to 3 do
+ !dinucleotide.(n).(m) <- !dinucleotide.(n).(m) /. !totd
+ done
+ )
+ done
+;;
+
+let getwts s =
+ let w = Array.make (String.length s) 0.0 in
+ ( for n = 0 to (String.length s) - 1 do
+ if !correlbg = 0 then w.(n) <- 0.25
+ else
+ let cn = basenum s.[n] in
+ if cn<0 then w.(n) <- 1.
+ else if n=0 then w.(n) <- !singlesite.(cn)
+ else
+ let cn1 = basenum s.[n-1] in
+ if cn1 < 0 || !correlbg = 1
+ then w.(n) <- !singlesite.(cn)
+ else w.(n) <- !dinucleotide.(cn1).(cn)
+ done;
+ w
+ )
+;;
+
+
+let filename2seqlist filenameset gotbgprobs =
+ let fastalist1 =
+ try
+ readfasta (List.hd filenameset)
+ with
+ _ -> ( Printf.fprintf stderr
+ "Error: unable to read file %s\n" (List.hd filenameset);
+ exit 1)
+ in
+ let fastalist2l =
+ List.map
+ (fun filename ->
+ try
+ readfasta filename
+ with
+ _ -> ( Printf.fprintf stderr
+ "Error: unable to read file %s\n" filename;
+ exit 1)
+ ) (List.tl filenameset)
+ in
+ let makeseqlist = function
+ | [] -> []
+ | h::t ->
+ List.fold_left (fun n m -> n @ (List.tl m)) (List.tl h) t
+ in
+ let fastalist = fastalist1 @ (makeseqlist fastalist2l) in
+ ( if gotbgprobs then () else
+ ( getbgprobs fastalist;
+ normalise_bgprobs ()
+ );
+ List.iter (fun f -> f.abgw <- getwts f.aseq) fastalist;
+ fastalist
+ )
+;;
+
+
+
+let rec blockseq seql seqnumset maxl maxw =
+ (*** the main routine, recursively forms blocks until no more are
+ *** possible *)
+ (*** first, make a list of best pairwise matches, removing matches
+ *** that conflict with better matches; then pairwise-align those,
+ *** one by one; then call self again. If null list, quit. *)
+
+ let seqhead = List.hd seql in
+ let seqlist = List.tl seql in
+
+ let find_neigh = function
+ | None -> seqhead
+ | Frag a -> a
+ in
+
+ let update_limits seqfrag = (* updates limits for consistent alignment *)
+ let rec sweep_limits frag l_stop r_start m =
+ let lim = seqfrag.limits in
+ if frag==seqhead then ()
+ else
+ let nextfrag = find_neigh (Hashtbl.find frag.neighbours (m,'r')) in
+ let thislabel = Hashtbl.find frag.seqlabel m in
+ let fix_limits n =
+ ( ( if thislabel <= l_stop then
+ let newlim =
+ try Hashtbl.find lim (n,'r')
+ with _ -> "0.3"
+ in
+ let oldlim =
+ try Hashtbl.find frag.limits (n,'r')
+ with _ -> "0.3"
+ in
+ if oldlim > newlim
+ then Hashtbl.replace frag.limits (n,'r') newlim);
+ ( if thislabel >= r_start then
+ let newlim =
+ try Hashtbl.find lim (n,'l')
+ with _ -> "0"
+ in
+ let oldlim =
+ try Hashtbl.find frag.limits (n,'l')
+ with _ -> "0"
+ in
+ if oldlim < newlim
+ then Hashtbl.replace frag.limits (n,'l') newlim))
+ in
+ (IntSet.iter fix_limits seqnumset;
+ sweep_limits nextfrag l_stop r_start m)
+
+ in (
+ IntSet.iter (* Set "diagonal" limits to self *)
+ (fun n ->
+ (Hashtbl.replace seqfrag.limits (n,'l')
+ (Hashtbl.find seqfrag.seqlabel n) ;
+ Hashtbl.replace seqfrag.limits (n,'r')
+ (Hashtbl.find seqfrag.seqlabel n)))
+ seqfrag.seqindex;
+ IntSet.iter (* replace each limit with most stringent limit of
+ *** a neighbouring fragment *)
+ (fun n ->
+ ( IntSet.iter
+ (fun m ->
+ (let fragl = find_neigh
+ (Hashtbl.find seqfrag.neighbours (m,'l')) in
+ let limn =
+ try Hashtbl.find fragl.limits (n,'l')
+ with _ -> "0"
+ in
+ let seqfraglim =
+ try Hashtbl.find seqfrag.limits (n,'l')
+ with _ -> "0"
+ in
+ if limn > seqfraglim
+ then Hashtbl.replace seqfrag.limits (n,'l') limn );
+ (let fragr = find_neigh
+ (Hashtbl.find seqfrag.neighbours (m,'r')) in
+ let limn =
+ try Hashtbl.find fragr.limits (n,'r')
+ with _ -> "0.3"
+ in
+ let seqfraglim =
+ try Hashtbl.find seqfrag.limits (n,'r')
+ with _ -> "0.3"
+ in
+ if limn < seqfraglim
+ then Hashtbl.replace seqfrag.limits (n,'r') limn )
+ ) seqfrag.seqindex )
+ ) seqnumset;
+ let do_sweep n =
+ let initfrag = (List.nth seqlist n) in
+ let llimit =
+ try Hashtbl.find seqfrag.limits (n,'l')
+ with _ -> "0"
+ in
+ let rlimit =
+ try Hashtbl.find seqfrag.limits (n,'r')
+ with _ -> "0.3"
+ in
+ sweep_limits initfrag llimit rlimit n
+ in
+ IntSet.iter do_sweep seqnumset
+ )
+ in
+
+ let inconsistent_frags frag1 frag2 =
+ let rec inconsistent_indices indexlist1 indexlist2 =
+ (*** check that the frags don't lie outside allowed ranges for
+ *** one another *)
+ let rec inconsistent_indices_onepair index1 indexlist2 =
+ match indexlist2 with
+ [] -> false
+ | h::t ->
+ let lim2l =
+ try Hashtbl.find frag2.limits (index1,'l')
+ with _ -> "0"
+ in
+ let lim2r =
+ try Hashtbl.find frag2.limits (index1,'r')
+ with _ -> "0.3"
+ in
+ let lim1l =
+ try Hashtbl.find frag1.limits (h,'l')
+ with _ -> "0"
+ in
+ let lim1r =
+ try Hashtbl.find frag1.limits (h,'r')
+ with _ -> "0.3"
+ in
+ if (compare (Hashtbl.find frag1.seqlabel index1) lim2l <= 0)
+ || (compare (Hashtbl.find frag1.seqlabel index1) lim2r >= 0)
+ || (compare (Hashtbl.find frag2.seqlabel h ) lim1l <= 0)
+ || (compare (Hashtbl.find frag2.seqlabel h) lim1r >= 0)
+ then true
+ else inconsistent_indices_onepair index1 t
+ in
+ match indexlist1 with
+ [] -> false
+ | h::t ->
+ if inconsistent_indices_onepair h indexlist2
+ then true
+ else inconsistent_indices t indexlist2
+ in inconsistent_indices (IntSet.elements frag1.seqindex)
+ (IntSet.elements frag2.seqindex)
+ in
+
+
+ let rec make_block_list fraglist outlist =
+ (*** make a list of best matches, to be traversed by fuse_block_list *)
+ (*** note: at least one of the frags being paired must have
+ *** "paired"=true *)
+ let rec one_block frag rest_of_list outlist =
+ match rest_of_list with
+ [] -> outlist
+ | a::t ->
+ if
+ (frag.aseq != "") && (a.aseq != "")
+ then
+ let n1,n2,l,p =
+ best_match frag.aseq a.aseq frag.abgw a.abgw
+ in
+ if p<1.0 then
+ let s1 = IntSet.elements frag.seqindex in
+ let s2 = IntSet.elements a.seqindex in
+ one_block frag t ((p,frag,n1,s1,a,n2,s2,l)::outlist)
+ else
+ one_block frag t outlist
+ else
+ one_block frag t outlist
+ in
+ match fraglist with
+ [] -> outlist
+ | a::t ->
+ let t1 =
+ List.filter (
+ fun f -> not (inconsistent_frags a f)) t in
+ make_block_list t (one_block a t1 outlist)
+ in
+
+
+ let fuse_block_list blocklist =
+
+ let rec firstfrag (p,frag1,n1,s1,frag2,n2,s2,l) =
+ if n1 >= (String.length frag1.aseq)
+ then
+ let frag1b= find_neigh (Hashtbl.find frag1.neighbours (List.hd s1,'r'))
+ in
+ firstfrag (p,frag1b,n1-(String.length frag1.aseq), s1,
+ frag2,n2,s2,l)
+ else if n2 >= (String.length frag2.aseq)
+ then
+ let frag2b= find_neigh (Hashtbl.find frag2.neighbours (List.hd s2,'r'))
+ in
+ firstfrag (p,frag1,n1,s1,
+ frag2b,n2-(String.length frag2.aseq), s2,l)
+ else (p,frag1,n1,s1,frag2,n2,s2,l)
+ in
+
+ let nextfrag (p,frag1,n1,s1,frag2,n2,s2,l) =
+ let f1gap = ((String.length frag1.aseq) - n1) in
+ let f2gap = ((String.length frag2.aseq) - n2) in
+ if (f1gap < f2gap) then
+ let frag1b =
+ find_neigh (Hashtbl.find frag1.neighbours (List.hd s1, 'r')) in
+ let lb = l - f1gap in
+ (p,frag1,n1,s1,frag2,n2,s2,f1gap),
+ (p,frag1b,0,s1,frag2,n2+f1gap,s2,lb)
+ else if (f1gap > f2gap) then
+ let frag2b =
+ find_neigh (Hashtbl.find frag2.neighbours (List.hd s2, 'r')) in
+ let lb = l - f2gap in
+ (p,frag1,n1,s1,frag2,n2,s2,f2gap),
+ (p,frag1,n1+f2gap,s1,frag2b,0,s2,lb)
+ else
+ let frag1b =
+ find_neigh (Hashtbl.find frag1.neighbours (List.hd s1, 'r')) in
+ let frag2b =
+ find_neigh (Hashtbl.find frag2.neighbours (List.hd s2, 'r')) in
+ let lb = l - f1gap in
+ (p,frag1,n1,s1,frag2,n2,s2,f1gap),
+ (p,frag1b,0,s1,frag2b,0,s2,lb)
+ in
+
+ let rec expand_block pair1 outlist =
+ let (p,frag1,n1,s1,frag2,n2,s2,l) = pair1 in
+ (*** if the block, as annotated, is no longer in one fragment because
+ *** of other fusings, then convert this block into a list of
+ *** single-fragment blocks *)
+ if inconsistent_frags frag1 frag2
+ then []
+ else
+ if String.length (frag1.aseq) > n1+l-1 &&
+ String.length (frag2.aseq) > n2+l-1
+ then (pair1::outlist)
+ else
+ if frag1=seqhead || frag2=seqhead then
+ (print_endline "problem";
+ outlist)
+ else
+ let pair1,pair2 = nextfrag pair1 in
+ expand_block pair2 (pair1::outlist)
+ in
+
+ let fuse_one_block (p,frag1,n1,s1,frag2,n2,s2,l) =
+ let consensus s1 s2 w1 w2 =
+ for i = 0 to (String.length s1) -1 do
+ if s1.[i] != s2.[i] then ( s1.[i] <- 'N'; w1.(i) <- 0.)
+ done;
+ s1, w1
+ in
+ (*** fuse the two frags; it is assumed that expand_block has
+ *** already been called so n1+l and n2+l won't exceed the length
+ *** of the frag *)
+ (*** divide frag1 and frag2 into frag1, frag2, frag3, frag4, frag5
+ *** where frag3 is the "shared piece" so the original frags are
+ *** split into frag1->frag3->frag4 and frag2->frag3->frag5 *)
+
+ (
+ let tempstr1=String.copy frag1.aseq in
+ let tempstr2=String.copy frag2.aseq in
+ let tempbgw1=Array.copy frag1.abgw in
+ let tempbgw2=Array.copy frag2.abgw in
+ (
+ frag1.aseq <- String.sub tempstr1 0 n1 ;
+ frag2.aseq <- String.sub tempstr2 0 n2 ;
+ frag1.abgw <- Array.sub tempbgw1 0 n1;
+ frag2.abgw <- Array.sub tempbgw2 0 n2;
+ let frag3 =
+ let as3, aw3 =
+ consensus (String.sub tempstr1 n1 l)
+ (String.sub tempstr2 n2 l) (Array.sub tempbgw1 n1 l)
+ (Array.sub tempbgw2 n2 l)
+ in
+ { header = "";
+ aseq = as3;
+ abgw = aw3;
+ seq = Hashtbl.copy frag1.seq;
+ seqindex=IntSet.union frag1.seqindex frag2.seqindex;
+ seqlabel= Hashtbl.copy frag1.seqlabel;
+ neighbours=Hashtbl.copy frag1.neighbours;
+ limits = Hashtbl.copy frag1.limits
+ } in
+ let frag4 = { header = "";
+ aseq = String.sub tempstr1 (n1+l) ((String.length tempstr1)-n1-l);
+ abgw = Array.sub tempbgw1 (n1+l) ((String.length tempstr1)-n1-l);
+ seq = Hashtbl.copy frag1.seq;
+ seqindex= IntSet.union frag1.seqindex IntSet.empty;
+ seqlabel= Hashtbl.copy frag1.seqlabel;
+ neighbours= Hashtbl.copy frag1.neighbours;
+ limits = Hashtbl.copy frag1.limits
+ } in
+ let frag5 = { header = "";
+ aseq = String.sub tempstr2 (n2+l) ((String.length tempstr2)-n2-l);
+ abgw = Array.sub tempbgw2 (n2+l) ((String.length tempstr2)-n2-l);
+ seq = Hashtbl.copy frag2.seq;
+ seqindex=IntSet.union frag2.seqindex IntSet.empty;
+ seqlabel= Hashtbl.copy frag2.seqlabel;
+ neighbours= Hashtbl.copy frag2.neighbours;
+ limits = Hashtbl.copy frag2.limits
+ }
+ in (
+ IntSet.iter
+ (fun n -> (Hashtbl.replace frag3.neighbours (n,'l')
+ (Frag frag1);
+ Hashtbl.replace frag3.neighbours (n,'r')
+ (Frag frag4);
+ Hashtbl.replace frag1.neighbours (n,'r')
+ (Frag frag3);
+ Hashtbl.replace frag4.neighbours (n,'l')
+ (Frag frag3);
+
+ Hashtbl.replace frag3.seq n
+ (String.sub (Hashtbl.find frag1.seq n) n1 l);
+ Hashtbl.replace frag4.seq n
+ (String.sub (Hashtbl.find frag1.seq n)
+ (n1+l) ((String.length tempstr1)-n1-l));
+ Hashtbl.replace frag1.seq n
+ (String.sub (Hashtbl.find frag1.seq n) 0 n1);
+
+ Hashtbl.replace frag3.seqlabel n
+ ((Hashtbl.find frag1.seqlabel n) ^ "1");
+ Hashtbl.replace frag4.seqlabel n
+ ((Hashtbl.find frag4.seqlabel n) ^ "2");
+ Hashtbl.replace frag1.seqlabel n
+ ((Hashtbl.find frag1.seqlabel n) ^ "0");
+ )) frag1.seqindex;
+ IntSet.iter
+ (fun n -> (Hashtbl.replace frag3.neighbours (n,'l')
+ (Frag frag2);
+ Hashtbl.replace frag3.neighbours (n,'r')
+ (Frag frag5);
+ Hashtbl.replace frag2.neighbours (n,'r')
+ (Frag frag3);
+ Hashtbl.replace frag5.neighbours (n,'l')
+ (Frag frag3);
+
+ Hashtbl.replace frag3.seq n
+ (String.sub (Hashtbl.find frag2.seq n) n2 l);
+ (try
+ Hashtbl.replace frag5.seq n
+ (String.sub (Hashtbl.find frag2.seq n)
+ (n2+l) ((String.length tempstr2)-n2-l))
+ with
+ _ ->
+ print_endline (Hashtbl.find frag2.seq n)
+ );
+ Hashtbl.replace frag2.seq n
+ (String.sub (Hashtbl.find frag2.seq n) 0 n2);
+
+ Hashtbl.replace frag3.seqlabel n
+ ((Hashtbl.find frag2.seqlabel n) ^ "1");
+ Hashtbl.replace frag5.seqlabel n
+ ((Hashtbl.find frag5.seqlabel n) ^ "2");
+ Hashtbl.replace frag2.seqlabel n
+ ((Hashtbl.find frag2.seqlabel n) ^ "0")
+ )) frag2.seqindex;
+
+ seqset := SeqSet.add frag3 !seqset;
+ seqset := SeqSet.add frag4 !seqset;
+ seqset := SeqSet.add frag5 !seqset;
+ update_limits frag3;
+ )
+ )
+ )
+ in
+
+ let fuse_one_frag f =
+ let expanded = expand_block (firstfrag f) [] in
+ if (List.for_all
+ (fun (p,frag1,n1,s1,frag2,n2,s2,l) ->
+ not (inconsistent_frags frag1 frag2))
+ expanded)
+ then
+ List.iter fuse_one_block expanded
+ in
+ List.iter fuse_one_frag blocklist
+ in
+ let fraglist = SeqSet.elements !seqset in
+ let blocklist =List.sort compare (make_block_list fraglist []) in
+ match blocklist with
+ [] -> []
+ | _ -> (fuse_block_list blocklist;
+ let (_,_,_,_,_,_,_,maxl) = List.hd blocklist in
+ blockseq seql seqnumset maxl maxw)
+;;
+
+(*** ***)
+(*** Create aligned sequence out of the bunch of fragments ***)
+(*** produced by blockseq ***)
+(*** ***)
+
+
+let rec assemble_frags_to_seqs seqhead seqlist foutlist =
+
+ let is_complete frag = (* if frag is multi-sequence, all copies
+ *** are at head of seqlist *)
+ if frag == seqhead then false else
+ if IntSet.cardinal frag.seqindex = 1 then true else
+ let nfragmatches =
+ List.fold_left
+ (fun n fr ->
+ if IntSet.compare frag.seqindex fr.seqindex = 0
+ then n+1 else n) 0 seqlist
+ in
+ if nfragmatches = IntSet.cardinal frag.seqindex
+ then true else false
+ in
+
+ let no_other_blocks frag = (* No other complete frag has a smaller
+ *** seqindex in seqlist *)
+ if IntSet.cardinal frag.seqindex = 1 then true else
+ List.fold_left
+ (fun b fr ->
+ if (IntSet.cardinal fr.seqindex > 0) &&
+ (IntSet.compare fr.seqindex frag.seqindex = -1) &&
+ is_complete fr
+ then false else b) true seqlist
+ in
+
+ let maxlen fragwidth seqindex = (* For a complete multi-sequence
+ *** frag, it must be aligned when
+ *** tacked on, and can't overlap an
+ *** existing sequence; so find
+ *** "padding" length *)
+ let maxlen_blockseqs =
+ IntSet.fold (fun idx n ->
+ let fout = (List.nth foutlist idx) in
+ if Buffer.length fout.fseq > n
+ then Buffer.length fout.fseq else n) seqindex 0
+ in
+ let push_up fs m =
+ let m1 = ref m in
+ (while
+ (fs.[!m1] =Char.lowercase fs.[!m1]) do
+ m1 := !m1 + 1
+ done;
+ while (!m1 < String.length fs)
+ &&(fs.[!m1] <> Char.lowercase fs.[!m1]) do
+ m1 := !m1 + 1
+ done;
+ !m1)
+ in
+ let rec overlap fs m1 n =
+ if n = fragwidth then false
+ else if (m1+n) >= String.length fs then false
+ else if fs.[m1+n] <> Char.lowercase fs.[m1+n] then true
+ else overlap fs m1 (n+1)
+ in
+ let mx = ref maxlen_blockseqs in
+ ( for n = 0 to (List.length foutlist - 1) do
+ let fs = Buffer.contents (List.nth foutlist n).fseq in
+ (while overlap fs !mx 0 do
+ mx := push_up fs !mx
+ done )
+ done;
+ !mx )
+ in
+
+ let find_neigh neigh1 =
+ match neigh1 with
+ None -> seqhead
+ | Frag a -> a
+ in
+
+ let rec process_frag_list seql foutl newseqlist n oldmaxlen =
+ match seql with
+ [] -> List.rev newseqlist
+ | frag::seqt ->
+ let fragseq = Hashtbl.find frag.seq n in
+ let fout = List.hd foutl in
+ if (is_complete frag) && (no_other_blocks frag) then
+ ( if IntSet.cardinal frag.seqindex = 1 then
+ (Buffer.add_string fout.fseq fragseq;
+ let nextfrag = find_neigh
+ (Hashtbl.find frag.neighbours (n,'r')) in
+ process_frag_list seqt (List.tl foutl)
+ (nextfrag::newseqlist) (n+1) oldmaxlen
+ )
+ else (
+ if oldmaxlen = 0 &&
+ (Buffer.length fout.fseq > 0 ||
+ n = List.hd (IntSet.elements frag.seqindex)) then (
+ (*** Find padding value, unless we're just starting ***)
+ (*** If just starting, make sure this is the first ***)
+ (*** in the block ***)
+ let newmaxlen = (maxlen (String.length fragseq)
+ frag.seqindex) in
+ while Buffer.length fout.fseq < newmaxlen do
+ Buffer.add_char fout.fseq '-'
+ done
+ ) else
+ while Buffer.length fout.fseq < oldmaxlen do
+ Buffer.add_char fout.fseq '-'
+ done;
+ Buffer.add_string fout.fseq (String.uppercase fragseq);
+ let nextfrag
+ = find_neigh (Hashtbl.find frag.neighbours (n,'r'))
+ in
+ process_frag_list seqt (List.tl foutl)
+ (nextfrag::newseqlist) (n+1)
+ ((Buffer.length fout.fseq) - (String.length fragseq))
+ )
+ )
+ else
+ process_frag_list seqt (List.tl foutl)
+ (frag::newseqlist) (n+1) oldmaxlen
+ in
+ let newseqlist = process_frag_list seqlist foutlist [] 0 0
+ in
+ let end_list =
+ List.fold_left
+ (fun b fr -> if fr==seqhead then b else false) true newseqlist
+ in
+ if end_list
+ then foutlist
+ else
+ assemble_frags_to_seqs seqhead newseqlist foutlist
+;;
+
+(*** ***)
+(*** ***)
+(*** ***)
+
+let print_aligned foutlist =
+ let print_one_line foutlist n poslist =
+ let print_one_line_seq fout pos =
+ let newpos = ref pos in
+ for m = 0 to 49 do
+ ( ( if m mod 50 = 0 then
+ let head =
+ if String.length fout.fheader < 11
+ then String.sub fout.fheader 1
+ (String.length fout.fheader -1)
+ else String.sub fout.fheader 1 10
+ in
+ ( print_string head;
+ (if (String.length head < 10) then
+ print_string
+ (String.make (10-(String.length head)) ' '));
+ print_string " "
+ )
+ else if ((m) mod 10) = 0 then
+ print_string " "
+ );
+ (let ch =
+ if n+m < Buffer.length fout.fseq
+ then (Buffer.contents fout.fseq).[m+n]
+ else ' ' in
+ (print_char ch;
+ if ch = '-' or ch = ' ' then
+ ()
+ else
+ newpos := !newpos+1
+ )
+ );
+ if ((m+1) mod 50) = 0 then (
+ print_string " ";
+ print_int !newpos;
+ print_string "\n";
+ ))
+ done;
+ !newpos
+ in
+ List.map2 print_one_line_seq foutlist poslist
+ in
+
+ let rec print_aligned foutlist n poslist =
+ (print_endline "";
+ if (List.fold_left (fun b f ->
+ if n < (Buffer.length f.fseq) then true else b)
+ false foutlist) then
+ print_aligned foutlist (n+50) (print_one_line foutlist n poslist)
+ else ()
+ )
+ in
+ let poslist = List.map (fun f -> 0) foutlist
+ in print_aligned foutlist 0 poslist;;
+
+(*** ***)
+(*** ***)
+(*** ***)
+
+let print_fasta foutlist =
+ List.iter
+ (fun f ->
+ print_endline f.fheader;
+ print_endline (Buffer.contents f.fseq);
+ print_string "\n"
+ ) foutlist ;;
+
+(*** ***)
+(*** ***)
+(*** ***)
+(*** ***)
+
+(*** main program below *)
+
+let filenameset = ref [] ;;
+let outtype = ref "";;
+
+let argspeclist =
+ [("-A", Arg.Unit
+ (fun () -> outtype := !outtype^"A"),
+ "aligned output (compare -F option) (default: only this)");
+ ("-b", Arg.Set_string auxfilename, "name of auxiliary file from which "
+ ^ "to read background sequences (overridden by -B)");
+ ("-B", Arg.Set_string bgfilename, "name of file containing background probabilities");
+ ("-F", Arg.Unit
+ (fun () -> outtype := !outtype^"F"),
+ "fasta output (can use both -A and -F in either order)");
+ ("-n", Arg.Set_int correlbg, "background correlation (default 2="
+ ^ "dinucleotide;\n 1=single-site basecounts, 0=0.25 per base)");
+ ("-x", Arg.Set_float prand, "set limit for how probable the match "
+ ^ "is by chance\n (default 0.002, smaller=more stringent)")]
+in
+let message =
+ String.concat ""
+ ["Sigma, version 1.0: simple greedy multiple alignment\n";
+ "Copyright (C) 2006 Rahul Siddharthan <rsidd at imsc.res.in>\n";
+ "Licensed under the GNU General Public License, version 2\n\n";
+ "Usage: sigma [options] inputfile.fasta [inputfile2.fasta ...]\n\n";
+ "Each fasta file may contain a single sequence or multiple sequences;\n";
+ "all sequences will be aligned together.\n\n";
+ "Options:"]
+in
+ Arg.parse argspeclist (fun s -> filenameset := !filenameset @ [s]) message;;
+
+
+if !outtype = "" then outtype := "A";;
+
+
+if !filenameset = [] then
+ (Printf.fprintf stderr "Must specify name of file(s) in fasta format (or use -help)\n";
+ exit 1);;
+
+
+if !correlbg > 2 then
+ (print_endline "-n > 2 not implemented; using 2";
+ correlbg := 2)
+else if !correlbg < 0 then
+ (print_endline "-n should be >= 0; assuming 0";
+ correlbg := 0)
+;;
+
+let gotbgprobs = ref (!correlbg = 0);;
+
+if !bgfilename > "" then
+ try
+ ( getbgprobs_from_bgfile !bgfilename;
+ normalise_bgprobs ();
+ gotbgprobs := true
+ )
+ with _ ->
+ Printf.fprintf stderr "Couldn't read bgprobs from file %s\n" !bgfilename
+;;
+
+if (not !gotbgprobs) then (
+ let auxseqlist =
+ match !auxfilename with
+ "" -> []
+ | _ ->
+ try readfasta !auxfilename with
+ _ -> (Printf.fprintf stderr
+ "Couldn't open file %s : using input sequences\n"
+ !auxfilename;
+ [])
+ in
+ if auxseqlist != [] then
+ ( getbgprobs auxseqlist;
+ normalise_bgprobs () ;
+ gotbgprobs := true
+ )
+)
+;;
+
+let seqlist = filename2seqlist !filenameset !gotbgprobs ;;
+
+match List.length seqlist with
+ 1 -> (Printf.fprintf stderr "Error: no sequences read\n"; exit 1)
+ | 2 -> (Printf.fprintf stderr
+ "Only one sequence read, no alignment needed\n"; exit 0)
+ | _ -> ()
+;;
+
+let nel=(List.length seqlist) - 2 in
+ for n = 0 to nel do
+ let h = List.nth seqlist (n+1) in
+ ( h.seqindex <- IntSet.add n IntSet.empty ;
+ Hashtbl.replace h.neighbours (n,'l') (Frag (List.hd seqlist));
+ Hashtbl.replace h.neighbours (n,'r') (Frag (List.hd seqlist));
+ Hashtbl.replace h.seqlabel n "0.";
+ Hashtbl.replace (List.hd seqlist).seqlabel n "-1";
+ Hashtbl.replace h.seq n (String.copy h.aseq);
+ (* h.abgw <- getwts h.aseq; *)
+ (* h.abgw <- Array.create (String.length h.aseq) 0.0; *)
+ Hashtbl.replace (List.hd seqlist).seq n "";
+ )
+ done;;
+
+seqset := makefastaset seqlist;
+
+let seqnumset =
+ let rec range n r =
+ if n<0 then r else range (n-1) (IntSet.add n r)
+ in range (List.length seqlist - 2) IntSet.empty
+in blockseq seqlist seqnumset 10000 0.;;
+
+let fout = List.map (fun fr -> {fheader=fr.header;
+ fseq = Buffer.create 1000} )
+ (List.tl seqlist) ;;
+
+assemble_frags_to_seqs (List.hd seqlist) (List.tl seqlist) fout ;;
+
+String.iter (fun c -> if c='A' then print_aligned fout else
+ print_fasta fout) !outtype;;
More information about the debian-med-commit
mailing list