[med-svn] r251 - in trunk/packages/sigma-align/trunk: . debian
Charles Plessy
charles-guest at alioth.debian.org
Sat Apr 7 06:46:41 UTC 2007
Author: charles-guest
Date: 2007-04-07 06:46:41 +0000 (Sat, 07 Apr 2007)
New Revision: 251
Removed:
trunk/packages/sigma-align/trunk/debian/lintian-override/
trunk/packages/sigma-align/trunk/sigma.ml
Log:
purging ocaml tings
Deleted: trunk/packages/sigma-align/trunk/sigma.ml
===================================================================
--- trunk/packages/sigma-align/trunk/sigma.ml 2007-04-07 06:44:15 UTC (rev 250)
+++ trunk/packages/sigma-align/trunk/sigma.ml 2007-04-07 06:46:41 UTC (rev 251)
@@ -1,1071 +0,0 @@
-(*** SiGMA : Simple Greedy Multiple Alignment
- *** Version 1.0 (March 11, 2006)
- *** Copyright (C) Rahul Siddharthan, 2006
- ***
- *** Requires ocaml 3.x (http://caml.inria.fr)
- *** When compiling, link the str module as follows:
- *** bytecode compiler -> ocamlc str.cma sigma.ml -o sigma.bytecode
- *** native-code compiler -> ocamlopt str.cmxa sigma.ml -o sigma
- ***
- *** (native is around 7x-10x faster, but no debugger support)
- ***
- *** To produce a static binary on linux (so that it's independent of
- *** your machine's particular version of libc), use
- *** ocamlopt -cc "gcc -static" str.cmxa sigma.ml -o sigma
- ***)
-
-(*** Strategy: find most significant common substring between two strings
- *** (mutations may exist but are taken care of in the scoring)
- *** and "align" them. Then next largest common substring among
- *** existing fragments that may be "consistently" aligned. Each time
- *** a common substring is found it is "fused" into one fragment (with
- *** multiple associated sequence numbers). Repeat until nothing left.
- ***
- *** Actually, for reasons of efficiency a list of "best matches" is made
- *** and aligned with consistency checks, rather than doing it one
- *** match at a time, but that's a detail.
- ***)
-
-module IntSet = Set.Make (struct type t=int
- let compare=compare end )
-
-(*** the main data type, the sequence fragment ***)
-type neigh_var = None | Frag of seq_frag
-and seq_frag = { header : string ;
- seq : (int, string) Hashtbl.t ;
- mutable aseq: string;
- mutable abgw: float array;
- mutable seqindex : IntSet.t;
- seqlabel: (int, string) Hashtbl.t;
- neighbours: (int * char, neigh_var) Hashtbl.t;
- limits : (int * char, string) Hashtbl.t }
-
-(*** Notes:
- ***
- *** 1. seq is a hashtbl keyed by sequence index n, giving corresponding
- *** sequence, since after alignment there may be mismatches.
- ***
- *** 2. aseq is a string giving the consensus of all aligned strings.
- *** For now, a mismatch will be marked as N and subsequent bases
- *** aligned to it will get a score zero.
- ***
- *** 3. seqindex is the original sequence to which the current sequence
- *** fragment belongs; a set containing initially a single number, but,
- *** as pairs are made, possibly multiple numbers
- ***
- *** 4. limits is a hashtbl keyed with pairs of sequence numbers and chars
- *** (n,c='l'/'r') mapping to the legal start (l) or end (r) position on
- *** sequence n for this fragment.
- ***
- *** 5. seqlabel is an array whose nth element is the label of that
- *** fragment on sequence n, a string representation of a float between 0
- *** and 1 (actually between 0 and 0.3). Each fragment on each sequence
- *** will have a unique label.
- ***
- *** 6. neighbours is a hashtbl that takes as keys a pair of the
- *** sequence number and a char 'l'/'r' and outputs a ref to the
- *** neighbouring frag on that seq in that direction
- ***
- *** 7. abgw is the background weight, the log of the (conditional)
- *** background probability for that base divided by 0.25.
- ***
- *** The sequence starts in two datastructures, a list (whose first
- *** element is a dummy) and a set that contains all the "real" elements.
- *** At each alignment step two sequences (elements of the set) are
- *** broken up into 5 sequences (a shared portion and the non-shared
- *** boundaries, which may be zero-length). The original list always
- *** contains the starting element, with pointer to the next fragment.
- *** It's like a doubly-linked list (actually, a set of lists with
- *** crosslinks, if you like) with end fragments pointing to a
- *** "dummy" fragment.
- ***)
-
-type fasta_seq = { fheader : string; mutable fseq: Buffer.t}
-
-(*** Global variables ***)
-let prand = ref 0.002
-let correlbg = ref 2
-let bgfilename = ref ""
-let auxfilename = ref ""
-let singlesite = ref (Array.make 4 0.0)
-let dinucleotide = ref (Array.make_matrix 4 4 0.0)
-
-(*** ***)
-(*** ***)
-(*** ***)
-
-(*** best_match: finds the most significant common substring with
- *** mismatches. The significance is basically the background
- *** probability of the string multiplied by the probability of
- *** seeing a match of that quality in two long strings of the given
- *** lengths. The probability for fragment size l with string lengths
- *** l1 and l2 is 1-(1-p)^[(l1-l+1)(l2-l+1)] = approx. p*(l1-l+1)*(l2-l+1)
- *** where p = bg prob * (l choose m), where m = number of mismatches.
- *** Lowest answer = most significant.
- ***)
-
-let best_match s1 s2 w1 w2 =
- let sl1 = String.length s1 in
- let sl2 = String.length s2 in
- let fmat = Array.make_matrix (sl1 + 1) (sl2 + 1) 1.0 in
- let lmat = Array.make_matrix (sl1 + 1) (sl2 + 1) 0 in
- let mmat = Array.make_matrix (sl1 + 1) (sl2 + 1) 0 in
- let best = ref 1.0 in
- let bestm = ref 0 in
- let bestn = ref 0 in
- let lbest = ref 0 in
- for m = 1 to sl1 do
- for n = 1 to sl2 do
- let a1 = s1.[m-1] in
- let b1 = s2.[n-1] in
- let f_old = fmat.(m-1).(n-1) in
- let m_old = mmat.(m-1).(n-1) in
- let l_new = lmat.(m-1).(n-1) + 1 in
- let m_new, f_new =
- if (a1=b1) && (a1 != 'N')
- then
- m_old, f_old *. sqrt(w1.(m-1)*.w2.(n-1))
- *. (float l_new) /. (float (l_new-m_old))
- else m_old + 1, f_old *. (float l_new) /. (float (m_old + 1))
- in
- let (f_new, l_new, m_new) = (min (f_new,l_new,m_new) (1.0,0,0)) in
- ( fmat.(m).(n) <- f_new;
- lmat.(m).(n) <- l_new;
- mmat.(m).(n) <- m_new;
- if (f_new < !best) then
- ( best := f_new; bestm := m; bestn := n)
- )
- done
- done;
- while (fmat.(!bestm - !lbest).(!bestn - !lbest) < 1.0)
- && (!lbest < !bestm) && (!lbest < !bestn) do
- lbest := !lbest + 1
- done;
- best := !best *. (float (sl1 - !lbest+1)) *. (float (sl2 - !lbest+1));
- if (!best < !prand)
- then !bestm - !lbest, !bestn - !lbest, !lbest, !best
- else 0,0,0,1.0
-;;
-
-(*** ***)
-(*** ***)
-(*** ***)
-
-let compareSeq seq1 seq2 =
- (*** if seqindex is same in both cases then check fragment number
- *** from "seqlabel", otherwise IntSet.compare *)
- (*** this is needed to create the module SeqSet, but isn't used,
- *** except to the extent that this should not return 0 unless
- *** two frags really are identical, which should not happen *)
- match (IntSet.compare seq1.seqindex seq2.seqindex ) with
- 0 -> ( let rec complabel = function
- | [] -> 0
- | a::t ->
- match (compare (Hashtbl.find seq1.seqlabel a)
- (Hashtbl.find seq2.seqlabel a)) with
- 0 -> complabel t
- | n -> n
- in complabel (IntSet.elements seq1.seqindex))
- | n -> n ;;
-
-
-module SeqSet = Set.Make (struct type t=seq_frag
- let compare=compareSeq end );;
-
-let seqset = ref SeqSet.empty ;;
-
-(*** ***)
-(*** ***)
-(*** ***)
-
-let readlines_noblank ffilename = (* works like python's readlines *)
- let fchan = open_in ffilename in
- let lenstr = in_channel_length fchan in
- let res = String.create lenstr in
- really_input fchan res 0 lenstr ;
- Str.split (Str.regexp "\\(\r\n\\|\n\\)+" ) res ;;
-
-let newfseq h = { header = h;
- seq = Hashtbl.create 5;
- aseq = "" ;
- abgw = Array.create 0 0.0;
- seqindex = IntSet.empty;
- seqlabel = Hashtbl.create 5;
- neighbours = Hashtbl.create 10;
- limits = Hashtbl.create 5 };;
-
-let readfasta filename = (* reads a fasta sequence, returns it as a list
- *** with a dummy first element used as a marker *)
- let flines = readlines_noblank filename in
- let rec flines2flist flist flines fcurr ln =
- if List.length flines = 0 then
- fcurr::flist
- else
- match (List.hd flines).[0] with
- '>' ->
- let fcurr2 = newfseq (List.hd flines)
- in flines2flist (fcurr::flist) (List.tl flines) fcurr2 (ln+1)
- | ';'|'#' -> flines2flist flist (List.tl flines) fcurr ln
- | _ ->
- let fcurr2 = { header = fcurr.header; seq=fcurr.seq;
- aseq = fcurr.aseq ^ (String.lowercase
- (List.hd flines)) ;
- abgw = Array.create 0 0.0;
- seqlabel= fcurr.seqlabel ;
- neighbours = fcurr.neighbours;
- seqindex = fcurr.seqindex;
- limits = fcurr.limits
- }
- in flines2flist flist (List.tl flines) fcurr2 ln
- in
- List.rev (flines2flist [] flines (newfseq "" ) (-1));;
-
-
-let makefastaset seqlist =
- let rec list2set a s =
- match a with
- [] -> s
- | h::t -> list2set t (SeqSet.add h s)
- in list2set (List.tl seqlist) (SeqSet.empty);;
-
-
-let basenum = function
- | 'a' | 'A' -> 0
- | 'c' | 'C' -> 1
- | 'g' | 'G' -> 2
- | 't' | 'T' -> 3
- | _ -> -1
-;;
-
-
-let getbgprobs_from_bgfile bgfile = (* read singlesite and dinucleotide
- from bgfile *)
- let bglines = readlines_noblank bgfile in
- let line2val line1 =
- let line1s=Str.split (Str.regexp "[ \t]+") line1 in
- if line1s > [] && line1.[0]!= '#' && (String.length (List.hd line1s) < 3)
- then
- let bs = List.hd line1s in
- if (String.length bs = 1) then
- !singlesite.(basenum bs.[0]) <-
- (float_of_string (List.nth line1s 1))
- else
- !dinucleotide.(basenum bs.[0]).(basenum bs.[1]) <-
- (float_of_string (List.nth line1s 1))
- else ()
- in
- List.iter line2val bglines;;
-
-
-let getbgprobs auxseqlist =
- (*** assume !singlesite, !dinucleotide are initialised to zero ***)
- List.iter
- (fun f ->
- for n = 0 to (String.length f.aseq) - 1 do
- let c = basenum f.aseq.[n] in
- if c >= 0 then
- (!singlesite.(c) <- !singlesite.(c) +. 1.0;
- !singlesite.(3-c) <- !singlesite.(3-c) +. 1.0;
- if n>0 then
- let c1 = basenum f.aseq.[n-1] in
- if c1 >=0 then
- (!dinucleotide.(c1).(c) <- !dinucleotide.(c1).(c) +. 1.0;
- )
- )
- done
- ) auxseqlist
-;;
-
-
-let normalise_bgprobs () =
- (let tots = ref 0.0 in
- (for n = 0 to 3 do
- tots := !tots +. !singlesite.(n)
- done;
- for n = 0 to 3 do
- !singlesite.(n) <- !singlesite.(n) /. !tots
- done
- )
- );
- for n = 0 to 3 do
- let totd = ref 0.0 in
- (for m = 0 to 3 do
- totd := !totd +. !dinucleotide.(n).(m)
- done;
- for m = 0 to 3 do
- !dinucleotide.(n).(m) <- !dinucleotide.(n).(m) /. !totd
- done
- )
- done
-;;
-
-let getwts s =
- let w = Array.make (String.length s) 0.0 in
- ( for n = 0 to (String.length s) - 1 do
- if !correlbg = 0 then w.(n) <- 0.25
- else
- let cn = basenum s.[n] in
- if cn<0 then w.(n) <- 1.
- else if n=0 then w.(n) <- !singlesite.(cn)
- else
- let cn1 = basenum s.[n-1] in
- if cn1 < 0 || !correlbg = 1
- then w.(n) <- !singlesite.(cn)
- else w.(n) <- !dinucleotide.(cn1).(cn)
- done;
- w
- )
-;;
-
-
-let filename2seqlist filenameset gotbgprobs =
- let fastalist1 =
- try
- readfasta (List.hd filenameset)
- with
- _ -> ( Printf.fprintf stderr
- "Error: unable to read file %s\n" (List.hd filenameset);
- exit 1)
- in
- let fastalist2l =
- List.map
- (fun filename ->
- try
- readfasta filename
- with
- _ -> ( Printf.fprintf stderr
- "Error: unable to read file %s\n" filename;
- exit 1)
- ) (List.tl filenameset)
- in
- let makeseqlist = function
- | [] -> []
- | h::t ->
- List.fold_left (fun n m -> n @ (List.tl m)) (List.tl h) t
- in
- let fastalist = fastalist1 @ (makeseqlist fastalist2l) in
- ( if gotbgprobs then () else
- ( getbgprobs fastalist;
- normalise_bgprobs ()
- );
- List.iter (fun f -> f.abgw <- getwts f.aseq) fastalist;
- fastalist
- )
-;;
-
-
-
-let rec blockseq seql seqnumset maxl maxw =
- (*** the main routine, recursively forms blocks until no more are
- *** possible *)
- (*** first, make a list of best pairwise matches, removing matches
- *** that conflict with better matches; then pairwise-align those,
- *** one by one; then call self again. If null list, quit. *)
-
- let seqhead = List.hd seql in
- let seqlist = List.tl seql in
-
- let find_neigh = function
- | None -> seqhead
- | Frag a -> a
- in
-
- let update_limits seqfrag = (* updates limits for consistent alignment *)
- let rec sweep_limits frag l_stop r_start m =
- let lim = seqfrag.limits in
- if frag==seqhead then ()
- else
- let nextfrag = find_neigh (Hashtbl.find frag.neighbours (m,'r')) in
- let thislabel = Hashtbl.find frag.seqlabel m in
- let fix_limits n =
- ( ( if thislabel <= l_stop then
- let newlim =
- try Hashtbl.find lim (n,'r')
- with _ -> "0.3"
- in
- let oldlim =
- try Hashtbl.find frag.limits (n,'r')
- with _ -> "0.3"
- in
- if oldlim > newlim
- then Hashtbl.replace frag.limits (n,'r') newlim);
- ( if thislabel >= r_start then
- let newlim =
- try Hashtbl.find lim (n,'l')
- with _ -> "0"
- in
- let oldlim =
- try Hashtbl.find frag.limits (n,'l')
- with _ -> "0"
- in
- if oldlim < newlim
- then Hashtbl.replace frag.limits (n,'l') newlim))
- in
- (IntSet.iter fix_limits seqnumset;
- sweep_limits nextfrag l_stop r_start m)
-
- in (
- IntSet.iter (* Set "diagonal" limits to self *)
- (fun n ->
- (Hashtbl.replace seqfrag.limits (n,'l')
- (Hashtbl.find seqfrag.seqlabel n) ;
- Hashtbl.replace seqfrag.limits (n,'r')
- (Hashtbl.find seqfrag.seqlabel n)))
- seqfrag.seqindex;
- IntSet.iter (* replace each limit with most stringent limit of
- *** a neighbouring fragment *)
- (fun n ->
- ( IntSet.iter
- (fun m ->
- (let fragl = find_neigh
- (Hashtbl.find seqfrag.neighbours (m,'l')) in
- let limn =
- try Hashtbl.find fragl.limits (n,'l')
- with _ -> "0"
- in
- let seqfraglim =
- try Hashtbl.find seqfrag.limits (n,'l')
- with _ -> "0"
- in
- if limn > seqfraglim
- then Hashtbl.replace seqfrag.limits (n,'l') limn );
- (let fragr = find_neigh
- (Hashtbl.find seqfrag.neighbours (m,'r')) in
- let limn =
- try Hashtbl.find fragr.limits (n,'r')
- with _ -> "0.3"
- in
- let seqfraglim =
- try Hashtbl.find seqfrag.limits (n,'r')
- with _ -> "0.3"
- in
- if limn < seqfraglim
- then Hashtbl.replace seqfrag.limits (n,'r') limn )
- ) seqfrag.seqindex )
- ) seqnumset;
- let do_sweep n =
- let initfrag = (List.nth seqlist n) in
- let llimit =
- try Hashtbl.find seqfrag.limits (n,'l')
- with _ -> "0"
- in
- let rlimit =
- try Hashtbl.find seqfrag.limits (n,'r')
- with _ -> "0.3"
- in
- sweep_limits initfrag llimit rlimit n
- in
- IntSet.iter do_sweep seqnumset
- )
- in
-
- let inconsistent_frags frag1 frag2 =
- let rec inconsistent_indices indexlist1 indexlist2 =
- (*** check that the frags don't lie outside allowed ranges for
- *** one another *)
- let rec inconsistent_indices_onepair index1 indexlist2 =
- match indexlist2 with
- [] -> false
- | h::t ->
- let lim2l =
- try Hashtbl.find frag2.limits (index1,'l')
- with _ -> "0"
- in
- let lim2r =
- try Hashtbl.find frag2.limits (index1,'r')
- with _ -> "0.3"
- in
- let lim1l =
- try Hashtbl.find frag1.limits (h,'l')
- with _ -> "0"
- in
- let lim1r =
- try Hashtbl.find frag1.limits (h,'r')
- with _ -> "0.3"
- in
- if (compare (Hashtbl.find frag1.seqlabel index1) lim2l <= 0)
- || (compare (Hashtbl.find frag1.seqlabel index1) lim2r >= 0)
- || (compare (Hashtbl.find frag2.seqlabel h ) lim1l <= 0)
- || (compare (Hashtbl.find frag2.seqlabel h) lim1r >= 0)
- then true
- else inconsistent_indices_onepair index1 t
- in
- match indexlist1 with
- [] -> false
- | h::t ->
- if inconsistent_indices_onepair h indexlist2
- then true
- else inconsistent_indices t indexlist2
- in inconsistent_indices (IntSet.elements frag1.seqindex)
- (IntSet.elements frag2.seqindex)
- in
-
-
- let rec make_block_list fraglist outlist =
- (*** make a list of best matches, to be traversed by fuse_block_list *)
- (*** note: at least one of the frags being paired must have
- *** "paired"=true *)
- let rec one_block frag rest_of_list outlist =
- match rest_of_list with
- [] -> outlist
- | a::t ->
- if
- (frag.aseq != "") && (a.aseq != "")
- then
- let n1,n2,l,p =
- best_match frag.aseq a.aseq frag.abgw a.abgw
- in
- if p<1.0 then
- let s1 = IntSet.elements frag.seqindex in
- let s2 = IntSet.elements a.seqindex in
- one_block frag t ((p,frag,n1,s1,a,n2,s2,l)::outlist)
- else
- one_block frag t outlist
- else
- one_block frag t outlist
- in
- match fraglist with
- [] -> outlist
- | a::t ->
- let t1 =
- List.filter (
- fun f -> not (inconsistent_frags a f)) t in
- make_block_list t (one_block a t1 outlist)
- in
-
-
- let fuse_block_list blocklist =
-
- let rec firstfrag (p,frag1,n1,s1,frag2,n2,s2,l) =
- if n1 >= (String.length frag1.aseq)
- then
- let frag1b= find_neigh (Hashtbl.find frag1.neighbours (List.hd s1,'r'))
- in
- firstfrag (p,frag1b,n1-(String.length frag1.aseq), s1,
- frag2,n2,s2,l)
- else if n2 >= (String.length frag2.aseq)
- then
- let frag2b= find_neigh (Hashtbl.find frag2.neighbours (List.hd s2,'r'))
- in
- firstfrag (p,frag1,n1,s1,
- frag2b,n2-(String.length frag2.aseq), s2,l)
- else (p,frag1,n1,s1,frag2,n2,s2,l)
- in
-
- let nextfrag (p,frag1,n1,s1,frag2,n2,s2,l) =
- let f1gap = ((String.length frag1.aseq) - n1) in
- let f2gap = ((String.length frag2.aseq) - n2) in
- if (f1gap < f2gap) then
- let frag1b =
- find_neigh (Hashtbl.find frag1.neighbours (List.hd s1, 'r')) in
- let lb = l - f1gap in
- (p,frag1,n1,s1,frag2,n2,s2,f1gap),
- (p,frag1b,0,s1,frag2,n2+f1gap,s2,lb)
- else if (f1gap > f2gap) then
- let frag2b =
- find_neigh (Hashtbl.find frag2.neighbours (List.hd s2, 'r')) in
- let lb = l - f2gap in
- (p,frag1,n1,s1,frag2,n2,s2,f2gap),
- (p,frag1,n1+f2gap,s1,frag2b,0,s2,lb)
- else
- let frag1b =
- find_neigh (Hashtbl.find frag1.neighbours (List.hd s1, 'r')) in
- let frag2b =
- find_neigh (Hashtbl.find frag2.neighbours (List.hd s2, 'r')) in
- let lb = l - f1gap in
- (p,frag1,n1,s1,frag2,n2,s2,f1gap),
- (p,frag1b,0,s1,frag2b,0,s2,lb)
- in
-
- let rec expand_block pair1 outlist =
- let (p,frag1,n1,s1,frag2,n2,s2,l) = pair1 in
- (*** if the block, as annotated, is no longer in one fragment because
- *** of other fusings, then convert this block into a list of
- *** single-fragment blocks *)
- if inconsistent_frags frag1 frag2
- then []
- else
- if String.length (frag1.aseq) > n1+l-1 &&
- String.length (frag2.aseq) > n2+l-1
- then (pair1::outlist)
- else
- if frag1=seqhead || frag2=seqhead then
- (print_endline "problem";
- outlist)
- else
- let pair1,pair2 = nextfrag pair1 in
- expand_block pair2 (pair1::outlist)
- in
-
- let fuse_one_block (p,frag1,n1,s1,frag2,n2,s2,l) =
- let consensus s1 s2 w1 w2 =
- for i = 0 to (String.length s1) -1 do
- if s1.[i] != s2.[i] then ( s1.[i] <- 'N'; w1.(i) <- 0.)
- done;
- s1, w1
- in
- (*** fuse the two frags; it is assumed that expand_block has
- *** already been called so n1+l and n2+l won't exceed the length
- *** of the frag *)
- (*** divide frag1 and frag2 into frag1, frag2, frag3, frag4, frag5
- *** where frag3 is the "shared piece" so the original frags are
- *** split into frag1->frag3->frag4 and frag2->frag3->frag5 *)
-
- (
- let tempstr1=String.copy frag1.aseq in
- let tempstr2=String.copy frag2.aseq in
- let tempbgw1=Array.copy frag1.abgw in
- let tempbgw2=Array.copy frag2.abgw in
- (
- frag1.aseq <- String.sub tempstr1 0 n1 ;
- frag2.aseq <- String.sub tempstr2 0 n2 ;
- frag1.abgw <- Array.sub tempbgw1 0 n1;
- frag2.abgw <- Array.sub tempbgw2 0 n2;
- let frag3 =
- let as3, aw3 =
- consensus (String.sub tempstr1 n1 l)
- (String.sub tempstr2 n2 l) (Array.sub tempbgw1 n1 l)
- (Array.sub tempbgw2 n2 l)
- in
- { header = "";
- aseq = as3;
- abgw = aw3;
- seq = Hashtbl.copy frag1.seq;
- seqindex=IntSet.union frag1.seqindex frag2.seqindex;
- seqlabel= Hashtbl.copy frag1.seqlabel;
- neighbours=Hashtbl.copy frag1.neighbours;
- limits = Hashtbl.copy frag1.limits
- } in
- let frag4 = { header = "";
- aseq = String.sub tempstr1 (n1+l) ((String.length tempstr1)-n1-l);
- abgw = Array.sub tempbgw1 (n1+l) ((String.length tempstr1)-n1-l);
- seq = Hashtbl.copy frag1.seq;
- seqindex= IntSet.union frag1.seqindex IntSet.empty;
- seqlabel= Hashtbl.copy frag1.seqlabel;
- neighbours= Hashtbl.copy frag1.neighbours;
- limits = Hashtbl.copy frag1.limits
- } in
- let frag5 = { header = "";
- aseq = String.sub tempstr2 (n2+l) ((String.length tempstr2)-n2-l);
- abgw = Array.sub tempbgw2 (n2+l) ((String.length tempstr2)-n2-l);
- seq = Hashtbl.copy frag2.seq;
- seqindex=IntSet.union frag2.seqindex IntSet.empty;
- seqlabel= Hashtbl.copy frag2.seqlabel;
- neighbours= Hashtbl.copy frag2.neighbours;
- limits = Hashtbl.copy frag2.limits
- }
- in (
- IntSet.iter
- (fun n -> (Hashtbl.replace frag3.neighbours (n,'l')
- (Frag frag1);
- Hashtbl.replace frag3.neighbours (n,'r')
- (Frag frag4);
- Hashtbl.replace frag1.neighbours (n,'r')
- (Frag frag3);
- Hashtbl.replace frag4.neighbours (n,'l')
- (Frag frag3);
-
- Hashtbl.replace frag3.seq n
- (String.sub (Hashtbl.find frag1.seq n) n1 l);
- Hashtbl.replace frag4.seq n
- (String.sub (Hashtbl.find frag1.seq n)
- (n1+l) ((String.length tempstr1)-n1-l));
- Hashtbl.replace frag1.seq n
- (String.sub (Hashtbl.find frag1.seq n) 0 n1);
-
- Hashtbl.replace frag3.seqlabel n
- ((Hashtbl.find frag1.seqlabel n) ^ "1");
- Hashtbl.replace frag4.seqlabel n
- ((Hashtbl.find frag4.seqlabel n) ^ "2");
- Hashtbl.replace frag1.seqlabel n
- ((Hashtbl.find frag1.seqlabel n) ^ "0");
- )) frag1.seqindex;
- IntSet.iter
- (fun n -> (Hashtbl.replace frag3.neighbours (n,'l')
- (Frag frag2);
- Hashtbl.replace frag3.neighbours (n,'r')
- (Frag frag5);
- Hashtbl.replace frag2.neighbours (n,'r')
- (Frag frag3);
- Hashtbl.replace frag5.neighbours (n,'l')
- (Frag frag3);
-
- Hashtbl.replace frag3.seq n
- (String.sub (Hashtbl.find frag2.seq n) n2 l);
- (try
- Hashtbl.replace frag5.seq n
- (String.sub (Hashtbl.find frag2.seq n)
- (n2+l) ((String.length tempstr2)-n2-l))
- with
- _ ->
- print_endline (Hashtbl.find frag2.seq n)
- );
- Hashtbl.replace frag2.seq n
- (String.sub (Hashtbl.find frag2.seq n) 0 n2);
-
- Hashtbl.replace frag3.seqlabel n
- ((Hashtbl.find frag2.seqlabel n) ^ "1");
- Hashtbl.replace frag5.seqlabel n
- ((Hashtbl.find frag5.seqlabel n) ^ "2");
- Hashtbl.replace frag2.seqlabel n
- ((Hashtbl.find frag2.seqlabel n) ^ "0")
- )) frag2.seqindex;
-
- seqset := SeqSet.add frag3 !seqset;
- seqset := SeqSet.add frag4 !seqset;
- seqset := SeqSet.add frag5 !seqset;
- update_limits frag3;
- )
- )
- )
- in
-
- let fuse_one_frag f =
- let expanded = expand_block (firstfrag f) [] in
- if (List.for_all
- (fun (p,frag1,n1,s1,frag2,n2,s2,l) ->
- not (inconsistent_frags frag1 frag2))
- expanded)
- then
- List.iter fuse_one_block expanded
- in
- List.iter fuse_one_frag blocklist
- in
- let fraglist = SeqSet.elements !seqset in
- let blocklist =List.sort compare (make_block_list fraglist []) in
- match blocklist with
- [] -> []
- | _ -> (fuse_block_list blocklist;
- let (_,_,_,_,_,_,_,maxl) = List.hd blocklist in
- blockseq seql seqnumset maxl maxw)
-;;
-
-(*** ***)
-(*** Create aligned sequence out of the bunch of fragments ***)
-(*** produced by blockseq ***)
-(*** ***)
-
-
-let rec assemble_frags_to_seqs seqhead seqlist foutlist =
-
- let is_complete frag = (* if frag is multi-sequence, all copies
- *** are at head of seqlist *)
- if frag == seqhead then false else
- if IntSet.cardinal frag.seqindex = 1 then true else
- let nfragmatches =
- List.fold_left
- (fun n fr ->
- if IntSet.compare frag.seqindex fr.seqindex = 0
- then n+1 else n) 0 seqlist
- in
- if nfragmatches = IntSet.cardinal frag.seqindex
- then true else false
- in
-
- let no_other_blocks frag = (* No other complete frag has a smaller
- *** seqindex in seqlist *)
- if IntSet.cardinal frag.seqindex = 1 then true else
- List.fold_left
- (fun b fr ->
- if (IntSet.cardinal fr.seqindex > 0) &&
- (IntSet.compare fr.seqindex frag.seqindex = -1) &&
- is_complete fr
- then false else b) true seqlist
- in
-
- let maxlen fragwidth seqindex = (* For a complete multi-sequence
- *** frag, it must be aligned when
- *** tacked on, and can't overlap an
- *** existing sequence; so find
- *** "padding" length *)
- let maxlen_blockseqs =
- IntSet.fold (fun idx n ->
- let fout = (List.nth foutlist idx) in
- if Buffer.length fout.fseq > n
- then Buffer.length fout.fseq else n) seqindex 0
- in
- let push_up fs m =
- let m1 = ref m in
- (while
- (fs.[!m1] =Char.lowercase fs.[!m1]) do
- m1 := !m1 + 1
- done;
- while (!m1 < String.length fs)
- &&(fs.[!m1] <> Char.lowercase fs.[!m1]) do
- m1 := !m1 + 1
- done;
- !m1)
- in
- let rec overlap fs m1 n =
- if n = fragwidth then false
- else if (m1+n) >= String.length fs then false
- else if fs.[m1+n] <> Char.lowercase fs.[m1+n] then true
- else overlap fs m1 (n+1)
- in
- let mx = ref maxlen_blockseqs in
- ( for n = 0 to (List.length foutlist - 1) do
- let fs = Buffer.contents (List.nth foutlist n).fseq in
- (while overlap fs !mx 0 do
- mx := push_up fs !mx
- done )
- done;
- !mx )
- in
-
- let find_neigh neigh1 =
- match neigh1 with
- None -> seqhead
- | Frag a -> a
- in
-
- let rec process_frag_list seql foutl newseqlist n oldmaxlen =
- match seql with
- [] -> List.rev newseqlist
- | frag::seqt ->
- let fragseq = Hashtbl.find frag.seq n in
- let fout = List.hd foutl in
- if (is_complete frag) && (no_other_blocks frag) then
- ( if IntSet.cardinal frag.seqindex = 1 then
- (Buffer.add_string fout.fseq fragseq;
- let nextfrag = find_neigh
- (Hashtbl.find frag.neighbours (n,'r')) in
- process_frag_list seqt (List.tl foutl)
- (nextfrag::newseqlist) (n+1) oldmaxlen
- )
- else (
- if oldmaxlen = 0 &&
- (Buffer.length fout.fseq > 0 ||
- n = List.hd (IntSet.elements frag.seqindex)) then (
- (*** Find padding value, unless we're just starting ***)
- (*** If just starting, make sure this is the first ***)
- (*** in the block ***)
- let newmaxlen = (maxlen (String.length fragseq)
- frag.seqindex) in
- while Buffer.length fout.fseq < newmaxlen do
- Buffer.add_char fout.fseq '-'
- done
- ) else
- while Buffer.length fout.fseq < oldmaxlen do
- Buffer.add_char fout.fseq '-'
- done;
- Buffer.add_string fout.fseq (String.uppercase fragseq);
- let nextfrag
- = find_neigh (Hashtbl.find frag.neighbours (n,'r'))
- in
- process_frag_list seqt (List.tl foutl)
- (nextfrag::newseqlist) (n+1)
- ((Buffer.length fout.fseq) - (String.length fragseq))
- )
- )
- else
- process_frag_list seqt (List.tl foutl)
- (frag::newseqlist) (n+1) oldmaxlen
- in
- let newseqlist = process_frag_list seqlist foutlist [] 0 0
- in
- let end_list =
- List.fold_left
- (fun b fr -> if fr==seqhead then b else false) true newseqlist
- in
- if end_list
- then foutlist
- else
- assemble_frags_to_seqs seqhead newseqlist foutlist
-;;
-
-(*** ***)
-(*** ***)
-(*** ***)
-
-let print_aligned foutlist =
- let print_one_line foutlist n poslist =
- let print_one_line_seq fout pos =
- let newpos = ref pos in
- for m = 0 to 49 do
- ( ( if m mod 50 = 0 then
- let head =
- if String.length fout.fheader < 11
- then String.sub fout.fheader 1
- (String.length fout.fheader -1)
- else String.sub fout.fheader 1 10
- in
- ( print_string head;
- (if (String.length head < 10) then
- print_string
- (String.make (10-(String.length head)) ' '));
- print_string " "
- )
- else if ((m) mod 10) = 0 then
- print_string " "
- );
- (let ch =
- if n+m < Buffer.length fout.fseq
- then (Buffer.contents fout.fseq).[m+n]
- else ' ' in
- (print_char ch;
- if ch = '-' or ch = ' ' then
- ()
- else
- newpos := !newpos+1
- )
- );
- if ((m+1) mod 50) = 0 then (
- print_string " ";
- print_int !newpos;
- print_string "\n";
- ))
- done;
- !newpos
- in
- List.map2 print_one_line_seq foutlist poslist
- in
-
- let rec print_aligned foutlist n poslist =
- (print_endline "";
- if (List.fold_left (fun b f ->
- if n < (Buffer.length f.fseq) then true else b)
- false foutlist) then
- print_aligned foutlist (n+50) (print_one_line foutlist n poslist)
- else ()
- )
- in
- let poslist = List.map (fun f -> 0) foutlist
- in print_aligned foutlist 0 poslist;;
-
-(*** ***)
-(*** ***)
-(*** ***)
-
-let print_fasta foutlist =
- List.iter
- (fun f ->
- print_endline f.fheader;
- print_endline (Buffer.contents f.fseq);
- print_string "\n"
- ) foutlist ;;
-
-(*** ***)
-(*** ***)
-(*** ***)
-(*** ***)
-
-(*** main program below *)
-
-let filenameset = ref [] ;;
-let outtype = ref "";;
-
-let argspeclist =
- [("-A", Arg.Unit
- (fun () -> outtype := !outtype^"A"),
- "aligned output (compare -F option) (default: only this)");
- ("-b", Arg.Set_string auxfilename, "name of auxiliary file from which "
- ^ "to read background sequences (overridden by -B)");
- ("-B", Arg.Set_string bgfilename, "name of file containing background probabilities");
- ("-F", Arg.Unit
- (fun () -> outtype := !outtype^"F"),
- "fasta output (can use both -A and -F in either order)");
- ("-n", Arg.Set_int correlbg, "background correlation (default 2="
- ^ "dinucleotide;\n 1=single-site basecounts, 0=0.25 per base)");
- ("-x", Arg.Set_float prand, "set limit for how probable the match "
- ^ "is by chance\n (default 0.002, smaller=more stringent)")]
-in
-let message =
- String.concat ""
- ["Sigma, version 1.0: simple greedy multiple alignment\n";
- "Copyright (C) 2006 Rahul Siddharthan <rsidd at imsc.res.in>\n";
- "Licensed under the GNU General Public License, version 2\n\n";
- "Usage: sigma [options] inputfile.fasta [inputfile2.fasta ...]\n\n";
- "Each fasta file may contain a single sequence or multiple sequences;\n";
- "all sequences will be aligned together.\n\n";
- "Options:"]
-in
- Arg.parse argspeclist (fun s -> filenameset := !filenameset @ [s]) message;;
-
-
-if !outtype = "" then outtype := "A";;
-
-
-if !filenameset = [] then
- (Printf.fprintf stderr "Must specify name of file(s) in fasta format (or use -help)\n";
- exit 1);;
-
-
-if !correlbg > 2 then
- (print_endline "-n > 2 not implemented; using 2";
- correlbg := 2)
-else if !correlbg < 0 then
- (print_endline "-n should be >= 0; assuming 0";
- correlbg := 0)
-;;
-
-let gotbgprobs = ref (!correlbg = 0);;
-
-if !bgfilename > "" then
- try
- ( getbgprobs_from_bgfile !bgfilename;
- normalise_bgprobs ();
- gotbgprobs := true
- )
- with _ ->
- Printf.fprintf stderr "Couldn't read bgprobs from file %s\n" !bgfilename
-;;
-
-if (not !gotbgprobs) then (
- let auxseqlist =
- match !auxfilename with
- "" -> []
- | _ ->
- try readfasta !auxfilename with
- _ -> (Printf.fprintf stderr
- "Couldn't open file %s : using input sequences\n"
- !auxfilename;
- [])
- in
- if auxseqlist != [] then
- ( getbgprobs auxseqlist;
- normalise_bgprobs () ;
- gotbgprobs := true
- )
-)
-;;
-
-let seqlist = filename2seqlist !filenameset !gotbgprobs ;;
-
-match List.length seqlist with
- 1 -> (Printf.fprintf stderr "Error: no sequences read\n"; exit 1)
- | 2 -> (Printf.fprintf stderr
- "Only one sequence read, no alignment needed\n"; exit 0)
- | _ -> ()
-;;
-
-let nel=(List.length seqlist) - 2 in
- for n = 0 to nel do
- let h = List.nth seqlist (n+1) in
- ( h.seqindex <- IntSet.add n IntSet.empty ;
- Hashtbl.replace h.neighbours (n,'l') (Frag (List.hd seqlist));
- Hashtbl.replace h.neighbours (n,'r') (Frag (List.hd seqlist));
- Hashtbl.replace h.seqlabel n "0.";
- Hashtbl.replace (List.hd seqlist).seqlabel n "-1";
- Hashtbl.replace h.seq n (String.copy h.aseq);
- (* h.abgw <- getwts h.aseq; *)
- (* h.abgw <- Array.create (String.length h.aseq) 0.0; *)
- Hashtbl.replace (List.hd seqlist).seq n "";
- )
- done;;
-
-seqset := makefastaset seqlist;
-
-let seqnumset =
- let rec range n r =
- if n<0 then r else range (n-1) (IntSet.add n r)
- in range (List.length seqlist - 2) IntSet.empty
-in blockseq seqlist seqnumset 10000 0.;;
-
-let fout = List.map (fun fr -> {fheader=fr.header;
- fseq = Buffer.create 1000} )
- (List.tl seqlist) ;;
-
-assemble_frags_to_seqs (List.hd seqlist) (List.tl seqlist) fout ;;
-
-String.iter (fun c -> if c='A' then print_aligned fout else
- print_fasta fout) !outtype;;
More information about the debian-med-commit
mailing list