[med-svn] [Git][med-team/python-pangolearn][upstream] New upstream version 2022-07-09+dfsg

Andreas Tille (@tille) gitlab at salsa.debian.org
Fri Dec 23 15:51:42 GMT 2022



Andreas Tille pushed to branch upstream at Debian Med / python-pangolearn


Commits:
e41ea908 by Andreas Tille at 2022-12-23T16:26:18+01:00
New upstream version 2022-07-09+dfsg
- - - - -


18 changed files:

- + .gitignore
- pangoLEARN/__init__.py
- pangoLEARN/data/decisionTree_recall_report.csv
- pangoLEARN/data/decision_tree_rules.zip
- pangoLEARN/data/lineageTree.pb
- pangoLEARN/data/lineages.downsample.csv
- pangoLEARN/data/lineages.hash.csv
- pangoLEARN/data/randomForest_recall_report.csv
- + pangoLEARN/scripts/copy_files_push_to_branch.sh
- pangoLEARN/scripts/curate_alignment.smk
- pangoLEARN/scripts/training_runner.sh
- + pangoLEARN/training/get_lineage_positions.py
- − pangoLEARN/training/outgroups.csv
- pangoLEARN/training/pangoLEARNDecisionTree_v1.py
- + pangoLEARN/training/pangoLEARNRandomForest_v1.py
- pangoLEARN/training/processOutputFile.py
- + pangoLEARN/training/reference.fasta
- + pangoLEARN/training/utils.py


Changes:

=====================================
.gitignore
=====================================
@@ -0,0 +1,127 @@
+# Editors
+.vscode/
+.idea/
+
+# Vagrant
+.vagrant/
+
+# Mac/OSX
+.DS_Store
+
+# Windows
+Thumbs.db
+
+# Source for the following rules: https://raw.githubusercontent.com/github/gitignore/master/Python.gitignore
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.snakemake/
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+


=====================================
pangoLEARN/__init__.py
=====================================
@@ -1,3 +1,7 @@
 _program = "pangoLEARN"
-__version__ = "2022-02-02"
-PANGO_VERSION = "v1.2.124"
+__version__ = "2022-07-09"
+PANGO_VERSION = "v1.12"
+
+__all__ = ["training"]
+
+from pangoLEARN import *


=====================================
pangoLEARN/data/decisionTree_recall_report.csv
=====================================
The diff for this file was not included because it is too large.

=====================================
pangoLEARN/data/decision_tree_rules.zip
=====================================
Binary files a/pangoLEARN/data/decision_tree_rules.zip and b/pangoLEARN/data/decision_tree_rules.zip differ


=====================================
pangoLEARN/data/lineageTree.pb
=====================================
The diff for this file was not included because it is too large.

=====================================
pangoLEARN/data/lineages.downsample.csv
=====================================
The diff for this file was not included because it is too large.

=====================================
pangoLEARN/data/lineages.hash.csv
=====================================
The diff for this file was not included because it is too large.

=====================================
pangoLEARN/data/randomForest_recall_report.csv
=====================================
The diff for this file was not included because it is too large.

=====================================
pangoLEARN/scripts/copy_files_push_to_branch.sh
=====================================
@@ -0,0 +1,56 @@
+#!/bin/bash
+
+OUTDIR=$1
+PANGO_VERSION=$2
+PLEARN_VERSION=$3
+REPO_PATH=/localdisk/home/s1680070/repositories
+
+echo $PLEARN_VERSION
+echo $PANGO_VERSION
+
+cd $REPO_PATH/pangolin-data && git pull
+git checkout -b "origin/prerelease_$PANGO_VERSION" "remotes/origin/prerelease_$PANGO_VERSION" || git checkout -b "prerelease_$PANGO_VERSION"
+git pull
+
+cd $REPO_PATH/pangoLEARN && git pull
+git checkout "prerelease_$PLEARN_VERSION" || git checkout -b "prerelease_$PLEARN_VERSION"
+git pull
+
+cp $OUTDIR/pangolearn.init.py   $REPO_PATH/pangoLEARN/pangoLEARN/__init__.py
+cp $OUTDIR/pangolin_data.init.py   $REPO_PATH/pangolin-data/pangolin_data/__init__.py
+
+cp $OUTDIR/decisionTreeHeaders_v1.joblib   $REPO_PATH/pangoLEARN/pangoLEARN/data/decisionTreeHeaders_v1.joblib
+cp $OUTDIR/decisionTree_v1.joblib   $REPO_PATH/pangoLEARN/pangoLEARN/data/decisionTree_v1.joblib
+cp $OUTDIR/decision_tree_rules.zip   $REPO_PATH/pangoLEARN/pangoLEARN/data/decision_tree_rules.zip
+
+cp $OUTDIR/random*   $REPO_PATH/pangoLEARN/pangoLEARN/data/
+cp $OUTDIR/random*   $REPO_PATH/pangoLEARN/pangoLEARN/data/
+
+cp $OUTDIR/metadata.final.csv   $REPO_PATH/pangoLEARN/pangoLEARN/data/lineages.downsample.csv
+cp $OUTDIR/lineage.hash.csv   $REPO_PATH/pangoLEARN/pangoLEARN/data/lineages.hash.csv
+
+cp $OUTDIR/random*   $REPO_PATH/pangolin-data/pangolin_data/data/
+cp $OUTDIR/lineage.hash.csv   $REPO_PATH/pangolin-data/pangolin_data/data/lineages.hash.csv
+cp $REPO_PATH/pango-designation/pango_designation/alias_key.json   $REPO_PATH/pangolin-data/pangolin_data/data/
+
+
+cd $REPO_PATH/pangoLEARN && git pull
+
+cp $REPO_PATH/pangolin-data/pangolin_data/data/lineageTree.pb $REPO_PATH/pangoLEARN/pangoLEARN/data/lineageTree.pb
+
+git add $REPO_PATH/pangoLEARN/pangoLEARN/data/*
+git add $REPO_PATH/pangoLEARN/pangoLEARN/__init__.py
+git status
+
+git commit -m "adding latest decision tree and rf model to pangoLEARN repo for trained version $PLEARN_VERSION corresponding to $PANGO_VERSION"
+git push --set-upstream origin "prerelease_$PLEARN_VERSION"
+git checkout master
+
+cd $REPO_PATH/pangolin-data && git pull
+git status 
+git add $REPO_PATH/pangolin-data/pangolin_data/data/*
+git add $REPO_PATH/pangolin-data/pangolin_data/__init__.py
+git commit -m "adding latest hash, alias file and rf model corresponding to $PANGO_VERSION"
+git push --set-upstream origin "prerelease_$PANGO_VERSION"
+git push origin HEAD:"prerelease_$PANGO_VERSION"
+git checkout main


=====================================
pangoLEARN/scripts/curate_alignment.smk
=====================================
@@ -1,61 +1,46 @@
-import csv
-from Bio import SeqIO
 import os
+import sys
+sys.path.insert(0, '/localdisk/home/s1680070/repositories/pangoLEARN')
+
+from Bio import SeqIO
+
 import collections
 import hashlib
 import collections
 import csv
+from pangoLEARN.training.get_lineage_positions import get_relevant_positions
+from pangoLEARN.training.utils import *
+
 from Bio import SeqIO
-from pangoLEARN.training import downsample
 from datetime import date
 today = date.today()
+csv.field_size_limit(sys.maxsize)
 
-def get_hash_string(record):
-    seq = str(record.seq).upper().encode()
-    hash_object = hashlib.md5(seq)
-    hash_string = hash_object.hexdigest()
-    return hash_string
-
-def get_dict(in_csv,name_column,data_column):
-    this_dict = {}
-    with open(in_csv,"r") as f:
-        reader = csv.DictReader(f)
-        for row in reader:
-            this_dict[row[name_column]] = row[data_column]
-    return this_dict
+repo_path = config["repo_path"].rstrip("/")
+pangoLEARN_path = os.path.join(repo_path, "pangoLEARN")
+pango_designation_path = os.path.join(repo_path, "pango-designation")
 
-def add_to_hash(seq_file):
-    hash_map = {}
-    seq_hash = {}
-    for record in SeqIO.parse(seq_file, "fasta"):
-        seq = str(record.seq).upper().encode()
-        hash_object = hashlib.md5(seq)
-        hash_map[hash_object.hexdigest()] = record.id
-        seq_hash[str(record.seq)] = record.id
-    return hash_map,seq_hash
-
-pangoLEARN_path = config["pangoLEARN_path"].rstrip("/")
-pangolin_path = config["pangolin_path"].rstrip("/")
-pango_designation_path = config["pango_designation_path"].rstrip("/")
-quokka_path = config["quokka_path"].rstrip("/")
+# config["pango_version"] = get_pango_version(pango_designation_path)
 
 data_date = config["data_date"]
 config["trim_start"] = 265
 config["trim_end"] = 29674
 config["lineages_csv"]=f"{pango_designation_path}/lineages.csv"
-config["reference"] = f"{pangolin_path}/pangolin/data/reference.fasta"
-config["outgroups"] = f"{pangoLEARN_path}/pangoLEARN/training/outgroups.csv"
+config["reference"] = f"{pangoLEARN_path}/pangoLEARN/training/reference.fasta"
 config["genbank_ref"] = f"{pangoLEARN_path}/pangoLEARN/training/WH04.gb"
 config["datadir"]= f"/localdisk/home/shared/raccoon-dog/{data_date}_gisaid/publish/gisaid"
 
 rule all:
     input:
         os.path.join(config["outdir"],"alignment.filtered.fasta"),
-        os.path.join(config["outdir"],"decision_tree_rules.zip"),
+        # os.path.join(config["outdir"],"lineage_recall_report.txt"),
         os.path.join(config["outdir"],"pangolearn.init.py"),
+        os.path.join(config["outdir"],"pangolin_data.init.py"),
+        os.path.join(config["outdir"],"training_summary.rf.txt"),
+        os.path.join(config["outdir"],"decision_tree_rules.txt"),
         os.path.join(config["outdir"],"lineage.hash.csv")
 
-rule make_init:
+rule make_plearn_init:
     output:
         init = os.path.join(config["outdir"],"pangolearn.init.py")
     run:
@@ -65,6 +50,22 @@ rule make_init:
             fw.write(f'''_program = "pangoLEARN"
 __version__ = "{pangolearn_new_v}"
 PANGO_VERSION = "{pango_version}"
+
+__all__ = ["training"]
+
+from pangoLEARN import *
+''')
+
+
+rule make_pdata_init:
+    output:
+        init = os.path.join(config["outdir"],"pangolin_data.init.py")
+    run:
+        pango_version = config["pango_version"]
+        with open(output.init,"w") as fw:
+            fw.write(f'''_program = "pangolin_data"
+__version__ = "{pango_version}"
+
 ''')
 
 rule filter_alignment:
@@ -146,7 +147,7 @@ rule get_variants:
     input:
         sam = os.path.join(config["outdir"],"alignment.sam")
     output:
-        csv = os.path.join(config["outdir"],"variants.csv")
+        csv = os.path.join(config["outdir"],"variants.csv") #gisaid.mutations.csv
     shell:
         """
         gofasta sam variants -t {workflow.cores} \
@@ -158,7 +159,7 @@ rule get_variants:
 
 rule add_lineage:
     input:
-        csv = os.path.join(config["outdir"],"variants.csv"),
+        csv = os.path.join(config["outdir"],"variants.csv"), #gisaid.mutations.csv
         lineages = os.path.join(config["outdir"],"lineages.designated.csv")
     output:
         csv = os.path.join(config["outdir"],"variants.lineages.csv")
@@ -179,80 +180,52 @@ rule add_lineage:
                     elif name in lineages_dict:
                         fw.write(f"{name},{variants},{lineages_dict[name]},\n")
                         
-
-rule downsample:
+rule filter_metadata:
     input:
         csv = os.path.join(config["outdir"],"variants.lineages.csv"),
         fasta = os.path.join(config["outdir"],"alignment.filtered.fasta")
     output:
-        csv = os.path.join(config["outdir"],"metadata.copy.csv"),
-        fasta = os.path.join(config["outdir"],"alignment.downsample.fasta")
+        csv = os.path.join(config["outdir"],"metadata.final.csv")
     run:
-        downsample.downsample(
-            input.csv, 
-            output.csv, 
-            input.fasta, 
-            output.fasta, 
-            1, config["outgroups"], 
-            False, 
-            False, 
-            10)
-
-rule filter_metadata:
-    input:
-        csv = os.path.join(config["outdir"],"metadata.copy.csv"),
-        fasta = os.path.join(config["outdir"],"alignment.downsample.fasta")
-    output:
-        csv = os.path.join(config["outdir"],"metadata.downsample.csv")
-    run:
-        in_downsample = {}
+        in_list = {}
         for record in SeqIO.parse(input.fasta,"fasta"):
-            in_downsample[record.id] = 1
+            in_list[record.id] = 1
 
         with open(output.csv, "w") as fw:
             fw.write("sequence_name,lineage\n")
             with open(input.csv, "r") as f:
                 reader = csv.DictReader(f)
                 for row in reader:
-                    if row["sequence_name"] in in_downsample:
+                    if row["sequence_name"] in in_list:
                         name = row["sequence_name"]
                         lineage = row["lineage"]
                         fw.write(f"{name},{lineage}\n")
 
-
 rule get_relevant_postions:
     input:
-        fasta = os.path.join(config["outdir"],"alignment.downsample.fasta"),
-        csv = os.path.join(config["outdir"],"metadata.downsample.csv"),
+        fasta = os.path.join(config["outdir"],"alignment.filtered.fasta"),
+        csv = os.path.join(config["outdir"],"metadata.final.csv"),
         reference = config["reference"]
-    params:
-        path_to_script = quokka_path
     output:
         relevant_pos_obj = os.path.join(config["outdir"],"relevantPositions.pickle"),
-    shell:
-        """
-        python {params.path_to_script}/quokka/getRelevantLocationsObject.py \
-        {input.reference:q} \
-        {input.fasta} \
-        {input.csv:q} \
-        {config[outdir]}
-        """
+    run:
+        get_relevant_positions(input.csv,input.fasta,input.reference,output.relevant_pos_obj)
 
-rule run_training:
+rule run_rf_training:
     input:
-        fasta = os.path.join(config["outdir"],"alignment.downsample.fasta"),
-        csv = os.path.join(config["outdir"],"metadata.downsample.csv"),
+        fasta = os.path.join(config["outdir"],"alignment.filtered.fasta"),
+        csv = os.path.join(config["outdir"],"metadata.final.csv"),
         reference = config["reference"],
         relevant_pos_obj = rules.get_relevant_postions.output.relevant_pos_obj
     params:
         path_to_script = pangoLEARN_path
     output:
-        headers = os.path.join(config["outdir"],"decisionTreeHeaders_v1.joblib"),
-        model = os.path.join(config["outdir"],"decisionTree_v1.joblib"),
-        txt = os.path.join(config["outdir"],"training_summary.txt")
+        headers = os.path.join(config["outdir"],"randomForestHeaders_v1.joblib"),
+        model = os.path.join(config["outdir"],"randomForest_v1.joblib"),
+        txt = os.path.join(config["outdir"],"training_summary.rf.txt")
     shell:
         """
-        python {params.path_to_script}/pangoLEARN/training/pangoLEARNDecisionTree_v1.py \
+        python {params.path_to_script}/pangoLEARN/training/pangoLEARNRandomForest_v1.py \
         {input.csv:q} \
         {input.fasta} \
         {input.reference:q} \
@@ -261,27 +234,38 @@ rule run_training:
         > {output.txt:q}
         """
 
-rule get_recall:
+rule run_dt_training:
     input:
-        txt = rules.run_training.output.txt
+        fasta = os.path.join(config["outdir"],"alignment.filtered.fasta"),
+        csv = os.path.join(config["outdir"],"metadata.final.csv"),
+        reference = config["reference"],
+        relevant_pos_obj = rules.get_relevant_postions.output.relevant_pos_obj
     params:
         path_to_script = pangoLEARN_path
     output:
-        txt = os.path.join(config["outdir"],"lineage_recall_report.txt")
+        headers = os.path.join(config["outdir"],"decisionTreeHeaders_v1.joblib"),
+        model = os.path.join(config["outdir"],"decisionTree_v1.joblib"),
+        txt = os.path.join(config["outdir"],"training_summary.dt.txt")
     shell:
         """
-        python {params.path_to_script}/pangoLEARN/training/processOutputFile.py {input.txt} > {output.txt}
+        python {params.path_to_script}/pangoLEARN/training/pangoLEARNDecisionTree_v1.py \
+        {input.csv:q} \
+        {input.fasta} \
+        {input.reference:q} \
+        {config[outdir]} \
+        {input.relevant_pos_obj} \
+        > {output.txt:q}
         """
 
 rule get_decisions:
     input:
         headers = os.path.join(config["outdir"],"decisionTreeHeaders_v1.joblib"),
         model = os.path.join(config["outdir"],"decisionTree_v1.joblib"),
-        txt = rules.run_training.output.txt
+        txt = rules.run_dt_training.output.txt
     params:
         path_to_script = pangoLEARN_path
     output:
-        txt = os.path.join(config["outdir"],"tree_rules.txt"),
+        txt = os.path.join(config["outdir"],"decision_tree_rules.txt"),
         zipped = os.path.join(config["outdir"],"decision_tree_rules.zip")
     shell:
         """
@@ -290,6 +274,18 @@ rule get_decisions:
         > {output.txt:q} && zip {output.zipped:q} {output.txt:q}
         """
 
+# rule get_recall:
+#     input:
+#         txt = rules.run_rf_training.output.txt
+#     params:
+#         path_to_script = pangoLEARN_path
+#     output:
+#         txt = os.path.join(config["outdir"],"lineage_recall_report.txt")
+#     shell:
+#         """
+#         python {params.path_to_script}/pangoLEARN/training/processOutputFile.py {input.txt} > {output.txt}
+#         """
+
 rule create_hash:
     input:
         fasta = os.path.join(config["outdir"],"alignment.filtered.fasta"),


=====================================
pangoLEARN/scripts/training_runner.sh
=====================================
@@ -12,7 +12,7 @@ echo $OUTDIR
 if [ -d $OUTDIR ] 
 then
     echo "Directory $OUTDIR exists." 
-else
+else
     mkdir $OUTDIR 
     echo "Directory $OUTDIR does not exist, making it."
 fi
@@ -32,32 +32,22 @@ then
 else
     DATA_DATE=$2
 fi
-echo "$UID training version $DATA_DATE"
+echo "training version $DATA_DATE"
 
 # LATEST_DATA=$(ls -td /localdisk/home/shared/raccoon-dog/2021*_gisaid/publish/gisaid | head -n 1)
 
-REPO_PATH=/localdisk/home/$UID/repositories
+REPO_PATH=/localdisk/home/s1680070/repositories
 
 PANGO_PATH=$REPO_PATH/pango-designation
 PLEARN_PATH=$REPO_PATH/pangoLEARN
-PANGOLIN_PATH=$REPO_PATH/pangolin
-QUOKKA_PATH=$REPO_PATH/quokka
 
 echo "pango designation path $PANGO_PATH"
 echo "pangoLEARN path $PLEARN_PATH"
-echo "pangolin path $PANGOLIN_PATH"
-
 
 cd $PANGO_PATH && git pull #gets any updates to the reports in the data directory
-PANGO_V=$(git tag --points-at HEAD)
-echo "pango version $PANGO_V"
-
+PANGO_VERSION=$(git describe --tags --abbrev=0)
+echo $PANGO_VERSION
 cd /localdisk/home/shared/raccoon-dog/ #gets any updates to the reports in the data directory
-echo "--config outdir=$OUTDIR data_date=$DATA_DATE pangolearn_version=$TODAY pango_version=$PANGO_V"
-echo "pangoLEARN training starting" | mail -s "update lineageTree.pb with pango designation version $PANGO_V" angie@soe.ucsc.edu
-snakemake --snakefile $PLEARN_PATH/pangoLEARN/scripts/curate_alignment.smk --rerun-incomplete --nolock --cores 1 --config pango_designation_path=$PANGO_PATH pangolin_path=$PANGOLIN_PATH pangoLEARN_path=$PLEARN_PATH quokka_path=$QUOKKA_PATH outdir=$OUTDIR data_date=$DATA_DATE pangolearn_version=$TODAY pango_version=$PANGO_V
-
-# cp $OUTDIR/pangolearn.init.py   /localdisk/home/s1680070/repositories/pangoLEARN/pangoLEARN/__init__.py
-# cp $OUTDIR/decision*   /localdisk/home/s1680070/repositories/pangoLEARN/pangoLEARN/data/
-# cp $OUTDIR/metadata.downsample.csv   /localdisk/home/s1680070/repositories/pangoLEARN/pangoLEARN/data/lineages.downsample.csv
-# cp $OUTDIR/lineage.hash.csv   /localdisk/home/s1680070/repositories/pangoLEARN/pangoLEARN/data/lineages.hash.csv
+echo "--config outdir=$OUTDIR data_date=$DATA_DATE pangolearn_version=$TODAY "
+echo "pangoLEARN training starting" | mail -s "update lineageTree.pb with pango designation version $PANGO_VERSION" angie@soe.ucsc.edu
+snakemake --snakefile $PLEARN_PATH/pangoLEARN/scripts/curate_alignment.smk --rerun-incomplete --nolock --cores 1 --config repo_path=$REPO_PATH outdir=$OUTDIR data_date=$DATA_DATE pangolearn_version=$TODAY pango_version=$PANGO_VERSION


=====================================
pangoLEARN/training/get_lineage_positions.py
=====================================
@@ -0,0 +1,52 @@
+import csv
+import pickle
+import collections
+from Bio import SeqIO
+from Bio.Align import MultipleSeqAlignment
+from Bio.Align.AlignInfo import SummaryInfo
+
+## outputs a pickled set of positions to use in training
+
+def get_lineage_cns50_sites(lineage, lineage_taxa, sequences_index, reference_sequence): 
+            
+    ### output of list of sequence_IDs for the given lineage
+    
+    lineage_seqs = MultipleSeqAlignment([])
+    for taxon in lineage_taxa:
+        lineage_seqs.append(sequences_index[taxon])
+
+    info = SummaryInfo(lineage_seqs)
+    consensus_sequence =  info.gap_consensus(
+    threshold=0.50, 
+    ambiguous='N')
+    
+    nuc_position = []
+    for i in range(len(consensus_sequence)):
+        if reference_sequence[i] != "N":
+            if consensus_sequence[i] != "N":
+                if consensus_sequence[i] != reference_sequence[i]:
+                    nuc_position.append(i)
+    return nuc_position
+
+def get_relevant_positions(designation_file,seq_file,ref_file,outfile):
+    reference = SeqIO.read(ref_file, "fasta")
+    sequences_index = SeqIO.index(seq_file, "fasta")
+    lineage_designations = collections.defaultdict(list)
+    lineage_set = set()
+
+    with open(designation_file, "r") as f:
+        reader = csv.DictReader(f)
+        for row in reader:
+            lineage_designations[row["lineage"]].append(row["sequence_name"])
+            lineage_set.add(row["lineage"])
+    
+    final_positions = set()
+    for lineage in lineage_set:
+        print(f"Getting positions for lineage {lineage}")
+        positions = get_lineage_cns50_sites(lineage, lineage_designations[lineage], sequences_index, reference)
+        print(f"\tFound {len(positions)}")
+        for i in positions:
+            final_positions.add(i)
+
+    with open(outfile, 'wb') as pickle_file:
+        pickle.dump(final_positions, pickle_file)
\ No newline at end of file
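
For reference, get_relevant_positions() is the function the reworked Snakefile now calls from its get_relevant_postions rule in place of the old quokka script. A minimal usage sketch follows; the file names mirror that rule's inputs and outputs, and the relative paths are placeholders standing in for the workflow's output directory:

from pangoLEARN.training.get_lineage_positions import get_relevant_positions

# arguments: designation CSV (sequence_name,lineage), filtered alignment,
# masked reference; the result is a pickled set of 0-based genome positions
# where some lineage's 50% gap-consensus differs from the reference
get_relevant_positions(
    "metadata.final.csv",
    "alignment.filtered.fasta",
    "reference.fasta",
    "relevantPositions.pickle",
)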


=====================================
pangoLEARN/training/outgroups.csv deleted
=====================================
@@ -1,11 +0,0 @@
-lineage,outgroup
-A,Wuhan/WH04/2020
-B,Wuhan/WHU01/2020
-B.1,Italy/ABR-IZSGC-TE5166/2020
-B.1.1,Germany/BY-MVP-V2010837/2020
-B.1.177,Spain/VC-IBV-98006461/2020
-B.1.1.7,England/MILK-9E05B3/2020
-B.1.160,Belgium/rega-10021225/2020
-B.1.2,USA/TX-HMH-MCoV-16306/2020
-B.1.258,England/LEED-2A8A64/2020
-B.1.243,USA/TX-HMH-MCoV-18678/2020


=====================================
pangoLEARN/training/pangoLEARNDecisionTree_v1.py
=====================================
@@ -12,6 +12,8 @@ import os
 from sklearn.model_selection import cross_val_score
 from Bio import SeqIO
 import pickle
+import time
+import os.path
 
 # file with lineage assignments
 lineage_file = sys.argv[1]
@@ -89,16 +91,9 @@ def readInAndFormatData():
 	dataList.append(getDataLine(referenceId, referenceSeq))
 
 	# create a dictionary of sequence ids to their assigned lineages
-	with open(lineage_file, 'r') as f:
-		for line in f:
-			line = line.strip()
-
-			split = line.split(",")
-
-			idToLineage[split[0]] = split[1]
-
-	# close the file
-	f.close()
+	lineage_designations = pd.read_csv(lineage_file, delimiter=",", dtype=str)
+	for index, row in lineage_designations.iterrows(): 
+		idToLineage[row["sequence_name"]] = row["lineage"]
 
 	seq_dict = {rec.id : rec.seq for rec in SeqIO.parse(sequence_file, "fasta")}
 
@@ -301,15 +296,14 @@ h = feature_cols.pop(0)
 X = pima[feature_cols]
 y = pima[h]
 
-# separate the data frame into testing/training data sets. 25% of the data will be used for training, 75% for test.
 X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=testing_percentage,random_state=0)
 
 print("training " + datetime.now().strftime("%m/%d/%Y, %H:%M:%S"), flush=True)
 
 header_out = os.path.join(sys.argv[4],"decisionTreeHeaders_v1.joblib")
-joblib.dump(headers, header_out, compress=9)
+joblib.dump(headers, header_out, compress=('lzma', 9))
 
-# instantiate the random forest with 1000 trees
+# instantiate the decision tree classifier
 dt = DecisionTreeClassifier()
 
 # fit the model
@@ -340,7 +334,8 @@ print(metrics.classification_report(y_test, y_pred, digits=3))
 # save the model files to compressed joblib files
 # using joblib instead of pickle because these large files need to be compressed
 model_out = os.path.join(sys.argv[4],"decisionTree_v1.joblib")
-joblib.dump(dt,  model_out, compress=9)
+joblib.dump(dt,  model_out, compress=('lzma', 9))
+
 
 print("model files created", flush=True)
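
The compression change above (from compress=9, which joblib treats as zlib level 9, to compress=('lzma', 9)) applies to both the header list and the fitted model. A small self-contained sketch of the round trip, using a toy object rather than the real training output:

import joblib

# dump with LZMA at maximum level; the tuple selects (codec, level)
headers = ["lineage", 265, 266, 29673]   # toy stand-in for the real header list
joblib.dump(headers, "headers_demo.joblib", compress=("lzma", 9))

# joblib detects the compression method on load, so reading back is unchanged
restored = joblib.load("headers_demo.joblib")
assert restored == headers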
 


=====================================
pangoLEARN/training/pangoLEARNRandomForest_v1.py
=====================================
@@ -0,0 +1,355 @@
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn import metrics
+from sklearn.datasets import make_classification
+from sklearn.model_selection import StratifiedShuffleSplit
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix, make_scorer
+from datetime import datetime
+import joblib
+import sys
+import os
+from sklearn.model_selection import cross_val_score
+from Bio import SeqIO
+import pickle
+import time
+import os.path
+
+# file with lineage assignments
+lineage_file = sys.argv[1]
+# file with sequences
+sequence_file = sys.argv[2]
+# how much of the data will be used for testing, instead of training
+testing_percentage = 0.0000000001
+
+relevant_positions = pickle.load(open(sys.argv[5], 'rb'))
+relevant_positions.add(0)
+
+# the path to the reference file. 
+# This reference sequence must be the same as is used in the pangolearn script!!
+referenceFile = sys.argv[3]
+
+# data storage
+dataList = []
+# dict for lookup efficiency
+indiciesToKeep = dict()
+
+referenceId = "Wuhan/WH04/2020"
+referenceSeq = ""
+
+idToLineage = dict()
+idToSeq = dict()
+
+mustKeepIds = []
+mustKeepLineages = []
+
+
+# function for handling weird sequence characters
+def clean(x, loc):
+	x = x.upper()
+	
+	if x == 'T' or x == 'A' or x == 'G' or x == 'C' or x == '-':
+		return x
+
+	if x == 'U':
+		return 'T'
+
+	# otherwise return value from reference
+	return referenceSeq[loc]
+
+def findReferenceSeq():
+	with open(referenceFile) as f:
+		currentSeq = ""
+
+		for line in f:
+			if ">" not in line:
+				currentSeq = currentSeq + line.strip()
+
+	f.close()
+	return currentSeq
+
+
+def getDataLine(seqId, seq):
+	dataLine = []
+	dataLine.append(seqId)
+
+	newSeq = ""
+
+	# for each character in the sequence
+	for index in range(len(seq)):
+		newSeq = newSeq + clean(seq[index], index)
+
+	dataLine.append(newSeq)
+	
+	return dataLine
+
+
+def readInAndFormatData():
+
+	# add the data line for the reference seq
+	idToLineage[referenceId] = "A"
+	dataList.append(getDataLine(referenceId, referenceSeq))
+
+	# create a dictionary of sequence ids to their assigned lineages
+	lineage_designations = pd.read_csv(lineage_file, delimiter=",", dtype=str)
+	for index, row in lineage_designations.iterrows(): 
+		idToLineage[row["sequence_name"]] = row["lineage"]
+
+	seq_dict = {rec.id : rec.seq for rec in SeqIO.parse(sequence_file, "fasta")}
+
+	print("files read in, now processing")
+
+	for key in seq_dict.keys():
+		if key in idToLineage:
+			dataList.append(getDataLine(key, seq_dict[key]))
+		else:
+			print("unable to find the lineage classification for: " + key)
+
+
+# find columns in the data list which always have the same value
+def findColumnsWithoutSNPs():
+
+	# for each index in the length of each sequence
+	for index in range(len(dataList[0][1])):
+		keep = False
+
+		# loop through all lines
+		for line in dataList:
+
+			# if there is a difference somewhere, then we want to keep it
+			if dataList[0][1][index] != line[1][index] or index == 0:
+				keep = True
+				break
+
+		# otherwise, save it
+		if keep and index in relevant_positions:
+			indiciesToKeep[index] = True
+
+
+# remove columns from the data list which don't have any SNPs. We do this because
+# these columns won't be relevant for a classifier which is trying to use
+# differences between sequences to assign lineages
+def removeOtherIndices(indiciesToKeep):
+
+	# instantiate the final list
+	finalList = []
+
+	indicies = list(indiciesToKeep.keys())
+	indicies.sort()
+
+	# while the dataList isn't empty
+	while len(dataList) > 0:
+
+		# pop the first line
+		line = dataList.pop(0)
+		seqId = line.pop(0)
+
+		line = line[0]
+		# initialize the finalLine
+		finalLine = []
+
+		for index in indicies:
+			if index == 0:
+				# if its the first index, then that's the lineage assignment, so keep it
+				finalLine.append(seqId)
+			else:
+				# otherwise keep everything at the indices in indiciesToKeep
+				finalLine.append(line[index])
+
+		# save the finalLine to the finalList
+		finalList.append(finalLine)
+
+	# return
+	return finalList
+
+def allEqual(list):
+		entries = dict()
+
+		for i in list:
+			if i not in entries:
+				entries[i] = True
+
+		return len(entries) == 1
+
+def removeAmbiguous():
+	idsToRemove = set()
+	lineMap = dict()
+	idMap = dict()
+
+	for line in dataList:
+		keyString = ",".join(line[1:])
+
+		if keyString not in lineMap:
+			lineMap[keyString] = []
+			idMap[keyString] = []
+ 
+		if line[0] in idToLineage:
+			lineMap[keyString].append(idToLineage[line[0]])
+			idMap[keyString].append(line[0])
+		else:
+			print("diagnostics")
+			print(line[0])
+			print(keyString)
+			print(line)
+	for key in lineMap:
+		if not allEqual(lineMap[key]):
+
+			skipRest = False
+
+			# see if any protected lineages are contained in the set, if so keep those ids
+			for lineage in lineMap[key]:
+				if lineage in mustKeepLineages:
+					skipRest = True
+
+					for i in idMap[key]:
+						if lineage != idToLineage[i] and i not in mustKeepIds:
+							idsToRemove.add(i)
+
+			# none of the lineages are protected, fire at will
+			if not skipRest:
+
+				lineageToCounts = dict()
+
+				aLineage = False
+				# find most common lineage
+				for lineage in lineMap[key]:
+					if lineage not in lineageToCounts:
+						lineageToCounts[lineage] = 0
+
+					lineageToCounts[lineage] = lineageToCounts[lineage] + 1
+					aLineage = lineage
+
+				m = aLineage
+				for lineage in lineageToCounts:
+					if lineageToCounts[lineage] > lineageToCounts[m]:
+						m = lineage
+
+
+				for i in idMap[key]:
+					if m != idToLineage[i]:
+						idsToRemove.add(i)
+
+	newList = []
+
+	print("keeping indicies:")
+
+	for line in dataList:
+		if line[0] not in idsToRemove:
+			print(line[0])
+			line[0] = idToLineage[line[0]]
+			newList.append(line)
+
+	return newList
+
+
+print("reading in data " + datetime.now().strftime("%m/%d/%Y, %H:%M:%S"), flush=True)
+
+referenceSeq = findReferenceSeq()
+
+readInAndFormatData()
+
+print("processing snps, formatting data " + datetime.now().strftime("%m/%d/%Y, %H:%M:%S"), flush=True)
+
+findColumnsWithoutSNPs()
+
+dataList = removeOtherIndices(indiciesToKeep)
+
+print("# sequences before blacklisting")
+print(len(dataList))
+
+dataList = removeAmbiguous()
+
+print("# sequences after blacklisting")
+print(len(dataList))
+
+# headers are the original genome locations
+headers = list(indiciesToKeep.keys())
+headers[0] = "lineage"
+
+print("setting up training " + datetime.now().strftime("%m/%d/%Y, %H:%M:%S"), flush=True)
+
+# checkpoint_file = os.path.join(sys.argv[4], "dataList.pickle")
+# dataList.to_pickle(checkpoint_file)
+# headers
+
+pima = pd.DataFrame(dataList, columns=headers)
+
+# nucleotide symbols which can appear
+categories = ['A', 'C', 'G', 'T', '-']
+
+# one hot encoding of all headers other than the first which is the lineage
+dummyHeaders = headers[1:]
+
+# add extra rows to ensure all of the categories are represented, as otherwise 
+# not enough columns will be created when we call get_dummies
+for i in categories:
+	line = [i] * len(dataList[0])
+	pima.loc[len(pima)] = line
+
+# get one-hot encoding
+pima = pd.get_dummies(pima, columns=dummyHeaders)
+
+# get rid of the fake data we just added
+pima.drop(pima.tail(len(categories)).index, inplace=True)
+
+feature_cols = list(pima)
+print(feature_cols)
+
+# remove the last column from the data frame. This is because we are trying to predict these values.
+h = feature_cols.pop(0)
+X = pima[feature_cols]
+y = pima[h]
+
+X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=testing_percentage,random_state=0)
+
+print("training " + datetime.now().strftime("%m/%d/%Y, %H:%M:%S"), flush=True)
+
+header_out = os.path.join(sys.argv[4],"randomForestHeaders_v1.joblib")
+joblib.dump(headers, header_out, compress=('lzma', 9))
+
+# instantiate the random forest with 12 trees
+rf = RandomForestClassifier(n_estimators=12, max_features=0.05, min_samples_split=10)
+
+# fit the model
+rf.fit(X,y)
+
+print("testing " + datetime.now().strftime("%m/%d/%Y, %H:%M:%S"), flush=True)
+
+# classify the test data
+y_pred=rf.predict(X_test)
+
+print(y_pred)
+
+# get the scores from these predictions
+y_scores = rf.predict_proba(X_test)
+
+print("generating statistics " + datetime.now().strftime("%m/%d/%Y, %H:%M:%S"), flush=True)
+
+#print the confusion matrix
+print("--------------------------------------------")
+print("Confusion Matrix")
+cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
+print(cnf_matrix)
+
+print("--------------------------------------------")
+print("Classification report")
+print(metrics.classification_report(y_test, y_pred, digits=3))
+
+# save the model files to compressed joblib files
+# using joblib instead of pickle because these large files need to be compressed
+model_out = os.path.join(sys.argv[4],"randomForest_v1.joblib")
+joblib.dump(rf,  model_out, compress=('lzma', 9))
+
+
+print("model files created", flush=True)
+
+# this method is used below when running 10-fold cross validation. It ensures
+# that the per-lineage statistics are generated for each cross-fold
+def classification_report_with_accuracy_score(y_true, y_pred):
+	print("--------------------------------------------")
+	print("Crossfold Classification Report")
+	print(metrics.classification_report(y_true, y_pred, digits=3))
+	return accuracy_score(y_true, y_pred)
+
+# optionally, run 10-fold cross validation (comment this out if not needed as it takes a while to run)
+# cross_validation_scores = cross_val_score(rf, X=X, y=y, cv=10, scoring=make_scorer(classification_report_with_accuracy_score))


=====================================
pangoLEARN/training/processOutputFile.py
=====================================
@@ -26,6 +26,7 @@ with open(outputfile, 'r') as f:
 		split = line.split()
 
 		if "macro avg" in line or "weighted avg" in line:
+
 			name = split[0] + " " + split[1]
 
 			if name not in nameToLineage:
@@ -36,7 +37,7 @@ with open(outputfile, 'r') as f:
 			nameToLineage[name].f1s.append(float(split[4]))
 			nameToLineage[name].supports.append(int(split[5]))
 
-		if len(split) == 5 and ":" not in line:
+		if len(split) == 5 and ":" not in line and "read" not in line:
 			name = split[0]
 
 			if name not in nameToLineage:
@@ -54,4 +55,4 @@ for key in nameToLineage:
 		nameToLineage[key].printStats()
 
 nameToLineage["macro avg"].printStats()
-nameToLineage["weighted avg"].printStats()
\ No newline at end of file
+nameToLineage["weighted avg"].printStats()


=====================================
pangoLEARN/training/reference.fasta
=====================================
@@ -0,0 +1,2 @@
+>outgroup_A
The remainder of the diff (a single ~30 kb line containing the masked SARS-CoV-2 reference genome, with the leading positions replaced by N) is not reproduced here because it is too large.
ATTTTTATGAACCACAAATCATTACTACAGACAACACATTTGTGTCTGGTAACTGTGATGTTGTAATAGGAATTGTCAACAACACAGTTTATGATCCTTTGCAACCTGAATTAGACTCATTCAAGGAGGAGTTAGATAAATATTTTAAGAATCATACATCACCAGATGTTGATTTAGGTGACATCTCTGGCATTAATGCTTCAGTTGTAAACATTCAAAAAGAAATTGACCGCCTCAATGAGGTTGCCAAGAATTTAAATGAATCTCTCATCGATCTCCAAGAACTTGGAAAGTATGAGCAGTATATAAAATGGCCATGGTACATTTGGCTAGGTTTTATAGCTGGCTTGATTGCCATAGTAATGGTGACAATTATGCTTTGCTGTATGACCAGTTGCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGATCCTGCTGCAAATTTGATGAAGACGACTCTGAGCCAGTGCTCAAAGGAGTCAAATTACATTACACATAAACGAACTTATGGATTTGTTTATGAGAATCTTCACAATTGGAACTGTAACTTTGAAGCAAGGTGAAATCAAGGATGCTACTCCTTCAGATTTTGTTCGCGCTACTGCAACGATACCGATACAAGCCTCACTCCCTTTCGGATGGCTTATTGTTGGCGTTGCACTTCTTGCTGTTTTTCAGAGCGCTTCCAAAATCATAACCCTCAAAAAGAGATGGCAACTAGCACTCTCCAAGGGTGTTCACTTTGTTTGCAACTTGCTGTTGTTGTTTGTAACAGTTTACTCACACCTTTTGCTCGTTGCTGCTGGCCTTGAAGCCCCTTTTCTCTATCTTTATGCTTTAGTCTACTTCTTGCAGAGTATAAACTTTGTAAGAATAATAATGAGGCTTTGGCTTTGCTGGAAATGCCGTTCCAAAAACCCATTACTTTATGATGCCAACTATTTTCTTTGCTGGCATACTAATTGTTACGACTATTGTATACCTTACAATAGTGTAACTTCTTCAATTGTCATTACTTCAGGTGATGGCACAACAAGTCCTATTTCTGAACATGACTACCAGATTGGTGGTTATACTGAAAAATGGGAATCTGGAGTAAAAGACTGTGTTGTATTACACAGTTACTTCACTTCAGACTATTACCAGCTGTACTCAACTCAATTGAGTACAGACACTGGTGTTGAACATGTTACCTTCTTCATCTACAATAAAATTGTTGATGAGCCTGAAGAACATGTCCAAATTCACACAATCGACGGTTCATCCGGAGTTGTTAATCCAGTAATGGAACCAATTTATGATGAACCGACGACGACTACTAGCGTGCCTTTGTAAGCACAAGCTGATGAGTACGAACTTATGTACTCATTCGTTTCGGAAGAGACAGGTACGTTAATAGTTAATAGCGTACTTCTTTTTCTTGCTTTCGTGGTATTCTTGCTAGTTACACTAGCCATCCTTACTGCGCTTCGATTGTGTGCGTACTGCTGCAATATTGTTAACGTGAGTCTTGTAAAACCTTCTTTTTACGTTTACTCTCGTGTTAAAAATCTGAATTCTTCTAGAGTTCCTGATCTTCTGGTCTAAACGAACTAAATATTATATTAGTTTTTCTGTTTGGAACTTTAATTTTAGCCATGGCAGATTCCAACGGTACTATTACCGTTGAAGAGCTTAAAAAGCTCCTTGAACAATGGAACCTAGTAATAGGTTTCCTATTCCTTACATGGATTTGTCTTCTACAATTTGCCTATGCCAACAGGAATAGGTTTTTGTATATAATTAAGTTAATTTTCCTCTGGCTGTTATGGCCAGTAACTTTAGCTTGTTTTGTGCTTGCTGCTGTTTACAGAATAAATTGGATCACCGGTGGAATTGCTATCGCAATGGCTTGTCTTGTAGGCTTGATGTGGCTCAGCTACTTCATTGCTTCTTTCAGACTGTTTGCGCGTACGCGTTCCATGTGGTCATTCAATCCAGAAACTAACATTCTTCTCAACGTGCCACTCCATGGCACTATTCTGACCAGACCGCTTCTAGAAAGTGAACTCGTAATCGGAGCTGTGATCCTTCGTGGACATCTTCGTATTGCTGGACACCATCTAGGACGCTGTGACATCAAGGACCTGCCTAAAGAAATCACTGTTGCTACATCACGAACGCTTTCTTATTACAAATTGGGAGCTTCGCAGCGTGTAGCAGGTGACTCAGGTTTTGCTGCATACAGTCGCTACAGGATTGGCAACTATAAATTAAACACAGACCATTCCAGTAGCAGTGACAATATTGCTTTGCTTGTACAGTAAGTGACAACAGATGTTTCATCTCGTTGACTTTCAGGTTACTATAGCAGAGATATTACTAATTATTATGAGGACTTTTAAAGTTTCCATTTGGAATCTTGATTACATCATAAACCTCATAATTAAAAATTTATCTAAGTCACTAACTGAGAATAAATATTCTCAATTAGATGAAGAGCAACCAATGGAGATTGATTAAACGAACATGAAAATTATTCTTTTCTTGGCACTGATAACACTCGCTACTTGTGAGCTTTATCACTACCAAGAGTGTGTTAGAGGTACAACAGTACTTTTAAAAGAACCTTGCTCTTCTGGAACATACGAGGGCAATTCACCATTTCATCCTCTAGCTGATAACAAATTTGCACTGACTTGCTTTAGCACTCAATTTGCTTTTGCTTGTCCTGACGGCGTAAAACACGTCTATCAGTTACGTGCCAGATCAGTTTCACCTAAACTGTTCATCAGACAAGAGGAAGTTCAAGAACTTTACTCTCCAATTTTTCTTATTGTTGCGGCAATAGTGTTTATAACACTTTGCTTCACACTCAAAAGAAAGACAGAATGATTGAACTTTCATTAATTGACTTCTATTTGTGCTTTTTAGCCTTTCTGCTATTCCTTGTTTTAATTATGCTTATTATCTTTTGGTTCTCACTTGAACTGCAAGATCATAATGAAACTTGTCACGCCTAAACGAACATGAAATTTCTTGTTTTCTTAGGAATCATCACAACTGTAGCTGCATTTCACCAAGAATGTAGTTTACAGTCATGTACTCAACATCAACCATATGTAGTTGATGACCCGTGTCCTATTCACTTCTATTCTAAATGGTATATTAGAGTAGGAGCTAGAAAATCAGCACCTTTAATTGAATTGTGCGTGGATGAGGCTGGTTCTAAATCACCCATTCAGTACATCGATATCGGTAATTATACAGTTTCCTGTTCACCTTTTACAATTAATTGCCAGGAACCTAAATTGGGTAGTCTTGTAGTGCGTTGTTCGTTCTATGAAGACTTTTTAGAGTATCATGACGTTCGTGTTGTTTTAGATTTCATCTAAACGAACAAACTAAAATGTCTGATAATGGACCCCAAAATCAGCGAAATGCACCCCGCATTACGTTTGGTGGACCCTCAGATTCAACTGGCAGTAACCAGAATGGAGAACGCAGTGGGGCGCGATCAAAACAACGTCGGCCCCAAGGTTTACCCAATAATACTGCGTCTTGGTTCACCGCTC
TCACTCAACATGGCAAGGAAGACCTTAAATTCCCTCGAGGACAAGGCGTTCCAATTAACACCAATAGCAGTCCAGATGACCAAATTGGCTACTACCGAAGAGCTACCAGACGAATTCGTGGTGGTGACGGTAAAATGAAAGATCTCAGTCCAAGATGGTATTTCTACTACCTAGGAACTGGGCCAGAAGCTGGACTTCCCTATGGTGCTAACAAAGACGGCATCATATGGGTTGCAACTGAGGGAGCCTTGAATACACCAAAAGATCACATTGGCACCCGCAATCCTGCTAACAATGCTGCAATCGTGCTACAACTTCCTCAAGGAACAACATTGCCAAAAGGCTTCTACGCAGAAGGGAGCAGAGGCGGCAGTCAAGCCTCTTCTCGTTCCTCATCACGTAGTCGCAACAGTTCAAGAAATTCAACTCCAGGCAGCAGTAGGGGAACTTCTCCTGCTAGAATGGCTGGCAATGGCGGTGATGCTGCTCTTGCTTTGCTGCTGCTTGACAGATTGAACCAGCTTGAGAGCAAAATGTCTGGTAAAGGCCAACAACAACAAGGCCAAACTGTCACTAAGAAATCTGCTGCTGAGGCTTCTAAGAAGCCTCGGCAAAAACGTACTGCCACTAAAGCATACAATGTAACACAAGCTTTCGGCAGACGTGGTCCAGAACAAACCCAAGGAAATTTTGGGGACCAGGAACTAATCAGACAAGGAACTGATTACAAACATTGGCCGCAAATTGCACAATTTGCCCCCAGCGCTTCAGCGTTCTTCGGAATGTCGCGCATTGGCATGGAAGTCACACCTTCGGGAACGTGGTTGACCTACACAGGTGCCATCAAATTGGATGACAAAGATCCAAATTTCAAAGATCAAGTCATTTTGCTGAATAAGCATATTGACGCATACAAAACATTCCCACCAACAGAGCCTAAAAAGGACAAAAAGAAGAAGGCTGATGAAACTCAAGCCTTACCGCAGAGACAGAAGAAACAGCAAACTGTGACTCTTCTTCCTGCTGCAGATTTGGATGATTTCTCCAAACAATTGCAACAATCCATGAGCAGTGCTGACTCAACTCAGGCCTAAACTCATGCAGACCACACAAGGCAGATGGGCTATATAAACGTTTTCGCTTTTCCGTTTACGATATATAGTCTACTCTTGTGCAGAATGAATTCTCGTAACTACATAGCACAAGTAGATGTAGTTAACTTTAATCTCACATAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
\ No newline at end of file


=====================================
pangoLEARN/training/utils.py
=====================================
@@ -0,0 +1,61 @@
+from Bio import SeqIO
+import os
+import csv
+import sys
+import hashlib
+
+def version_from_init(init_file):
+    # Parse the __version__ string out of a package __init__.py file.
+    version = None
+    with open(init_file, "r") as fr:
+        for l in fr:
+            if l.startswith("__version__"):
+                l = l.rstrip("\n")
+                version = l.split('=')[1]
+                version = version.replace('"', "").replace(" ", "")
+                break
+    return version
+
+def get_pango_version(pango_path):
+    # Walk the pango installation and return the last version string found
+    # in an __init__.py file; exit if none is found.
+    version = ""
+
+    for r, d, f in os.walk(pango_path):
+        for fn in f:
+            if fn == "__init__.py":
+                found = version_from_init(os.path.join(r, fn))
+                if found:
+                    version = found
+    print("Pango version is:", version)
+
+    if not version:
+        sys.stderr.write("No version found at pango path\n")
+        sys.exit(-1)
+    else:
+        return version
+
+def get_hash_string(record):
+    # MD5 hash of the upper-cased sequence of a single SeqRecord.
+    seq = str(record.seq).upper().encode()
+    hash_object = hashlib.md5(seq)
+    hash_string = hash_object.hexdigest()
+    return hash_string
+
+def get_dict(in_csv, name_column, data_column):
+    # Build a {name_column: data_column} lookup from a CSV file.
+    this_dict = {}
+    with open(in_csv, "r") as f:
+        reader = csv.DictReader(f)
+        for row in reader:
+            this_dict[row[name_column]] = row[data_column]
+    return this_dict
+
+def add_to_hash(seq_file):
+    # Map the MD5 hash of each sequence, and the raw sequence itself, to its record id.
+    hash_map = {}
+    seq_hash = {}
+    for record in SeqIO.parse(seq_file, "fasta"):
+        seq = str(record.seq).upper().encode()
+        hash_object = hashlib.md5(seq)
+        hash_map[hash_object.hexdigest()] = record.id
+        seq_hash[str(record.seq)] = record.id
+    return hash_map, seq_hash
\ No newline at end of file
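
For reference, a minimal usage sketch of the helpers added in utils.py above (the import path assumes pangoLEARN.training is importable as a package; the file names and CSV column names here are hypothetical, not taken from the training pipeline):

    from pangoLEARN.training.utils import get_dict, add_to_hash, get_pango_version

    # Hypothetical inputs: a FASTA of training sequences and a metadata CSV
    # with "taxon" and "lineage" columns.
    lineage_of = get_dict("lineages.csv", "taxon", "lineage")
    hash_map, seq_hash = add_to_hash("sequences.fasta")

    # hash_map keys are the MD5 of each upper-cased sequence, so identical
    # sequences share a single entry pointing at one record id.
    for md5, name in hash_map.items():
        print(md5, name, lineage_of.get(name, "unassigned"))

    # Report the version string found in a pango-designation checkout.
    get_pango_version("/path/to/pango-designation")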



View it on GitLab: https://salsa.debian.org/med-team/python-pangolearn/-/commit/e41ea9084bcc24c85cac00d08c389bb1b6207c12




