[med-svn] [Git][med-team/python-pangolearn][master] 5 commits: routine-update: New upstream version
Andreas Tille (@tille)
gitlab at salsa.debian.org
Fri Dec 23 15:50:39 GMT 2022
Andreas Tille pushed to branch master at Debian Med / python-pangolearn
Commits:
641e9307 by Andreas Tille at 2022-12-23T16:26:11+01:00
routine-update: New upstream version
- - - - -
e41ea908 by Andreas Tille at 2022-12-23T16:26:18+01:00
New upstream version 2022-07-09+dfsg
- - - - -
91a34c4f by Andreas Tille at 2022-12-23T16:28:43+01:00
Update upstream source from tag 'upstream/2022-07-09+dfsg'
Update to upstream version '2022-07-09+dfsg'
with Debian dir 16412f555cd46b95837f64d33a64f7116fa6ef56
- - - - -
71f5c3ee by Andreas Tille at 2022-12-23T16:28:45+01:00
routine-update: Standards-Version: 4.6.2
- - - - -
2fdb284a by Andreas Tille at 2022-12-23T16:33:53+01:00
routine-update: Ready to upload to unstable
- - - - -
20 changed files:
- + .gitignore
- debian/changelog
- debian/control
- pangoLEARN/__init__.py
- pangoLEARN/data/decisionTree_recall_report.csv
- pangoLEARN/data/decision_tree_rules.zip
- pangoLEARN/data/lineageTree.pb
- pangoLEARN/data/lineages.downsample.csv
- pangoLEARN/data/lineages.hash.csv
- pangoLEARN/data/randomForest_recall_report.csv
- + pangoLEARN/scripts/copy_files_push_to_branch.sh
- pangoLEARN/scripts/curate_alignment.smk
- pangoLEARN/scripts/training_runner.sh
- + pangoLEARN/training/get_lineage_positions.py
- − pangoLEARN/training/outgroups.csv
- pangoLEARN/training/pangoLEARNDecisionTree_v1.py
- + pangoLEARN/training/pangoLEARNRandomForest_v1.py
- pangoLEARN/training/processOutputFile.py
- + pangoLEARN/training/reference.fasta
- + pangoLEARN/training/utils.py
Changes:
=====================================
.gitignore
=====================================
@@ -0,0 +1,127 @@
+# Editors
+.vscode/
+.idea/
+
+# Vagrant
+.vagrant/
+
+# Mac/OSX
+.DS_Store
+
+# Windows
+Thumbs.db
+
+# Source for the following rules: https://raw.githubusercontent.com/github/gitignore/master/Python.gitignore
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.snakemake/
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
=====================================
debian/changelog
=====================================
@@ -1,3 +1,10 @@
+python-pangolearn (2022-07-09+dfsg-1) unstable; urgency=medium
+
+ * New upstream version
+ * Standards-Version: 4.6.2 (routine-update)
+
+ -- Andreas Tille <tille at debian.org> Fri, 23 Dec 2022 16:29:12 +0100
+
python-pangolearn (2022-02-02+dfsg-1) unstable; urgency=medium
* Initial release (Closes: #986458)
=====================================
debian/control
=====================================
@@ -8,7 +8,7 @@ Build-Depends: debhelper-compat (= 13),
dh-python,
python3-all,
python3-setuptools
-Standards-Version: 4.6.0
+Standards-Version: 4.6.2
Vcs-Browser: https://salsa.debian.org/med-team/python-pangolearn
Vcs-Git: https://salsa.debian.org/med-team/python-pangolearn.git
Homepage: https://github.com/cov-lineages/pangoLEARN
=====================================
pangoLEARN/__init__.py
=====================================
@@ -1,3 +1,7 @@
_program = "pangoLEARN"
-__version__ = "2022-02-02"
-PANGO_VERSION = "v1.2.124"
+__version__ = "2022-07-09"
+PANGO_VERSION = "v1.12"
+
+__all__ = ["training"]
+
+from pangoLEARN import *
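
For reference, a minimal check of the bumped metadata (a sketch, assuming the package and its training subpackage import cleanly):

    # Hypothetical sanity check of the new module attributes.
    import pangoLEARN

    print(pangoLEARN._program)       # "pangoLEARN"
    print(pangoLEARN.__version__)    # "2022-07-09"
    print(pangoLEARN.PANGO_VERSION)  # "v1.12"
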
=====================================
pangoLEARN/data/decisionTree_recall_report.csv
=====================================
The diff for this file was not included because it is too large.
=====================================
pangoLEARN/data/decision_tree_rules.zip
=====================================
Binary files a/pangoLEARN/data/decision_tree_rules.zip and b/pangoLEARN/data/decision_tree_rules.zip differ
=====================================
pangoLEARN/data/lineageTree.pb
=====================================
The diff for this file was not included because it is too large.
=====================================
pangoLEARN/data/lineages.downsample.csv
=====================================
The diff for this file was not included because it is too large.
=====================================
pangoLEARN/data/lineages.hash.csv
=====================================
The diff for this file was not included because it is too large.
=====================================
pangoLEARN/data/randomForest_recall_report.csv
=====================================
The diff for this file was not included because it is too large.
=====================================
pangoLEARN/scripts/copy_files_push_to_branch.sh
=====================================
@@ -0,0 +1,56 @@
+#!/bin/bash
+
+OUTDIR=$1
+PANGO_VERSION=$2
+PLEARN_VERSION=$3
+REPO_PATH=/localdisk/home/s1680070/repositories
+
+echo $PLEARN_VERSION
+echo $PANGO_VERSION
+
+cd $REPO_PATH/pangolin-data && git pull
+git checkout -b "origin/prerelease_$PANGO_VERSION" "remotes/origin/prerelease_$PANGO_VERSION" || git checkout -b "prerelease_$PANGO_VERSION"
+git pull
+
+cd $REPO_PATH/pangoLEARN && git pull
+git checkout "prerelease_$PLEARN_VERSION" || git checkout -b "prerelease_$PLEARN_VERSION"
+git pull
+
+cp $OUTDIR/pangolearn.init.py $REPO_PATH/pangoLEARN/pangoLEARN/__init__.py
+cp $OUTDIR/pangolin_data.init.py $REPO_PATH/pangolin-data/pangolin_data/__init__.py
+
+cp $OUTDIR/decisionTreeHeaders_v1.joblib $REPO_PATH/pangoLEARN/pangoLEARN/data/decisionTreeHeaders_v1.joblib
+cp $OUTDIR/decisionTree_v1.joblib $REPO_PATH/pangoLEARN/pangoLEARN/data/decisionTree_v1.joblib
+cp $OUTDIR/decision_tree_rules.zip $REPO_PATH/pangoLEARN/pangoLEARN/data/decision_tree_rules.zip
+
+cp $OUTDIR/random* $REPO_PATH/pangoLEARN/pangoLEARN/data/
+cp $OUTDIR/random* $REPO_PATH/pangoLEARN/pangoLEARN/data/
+
+cp $OUTDIR/metadata.final.csv $REPO_PATH/pangoLEARN/pangoLEARN/data/lineages.downsample.csv
+cp $OUTDIR/lineage.hash.csv $REPO_PATH/pangoLEARN/pangoLEARN/data/lineages.hash.csv
+
+cp $OUTDIR/random* $REPO_PATH/pangolin-data/pangolin_data/data/
+cp $OUTDIR/lineage.hash.csv $REPO_PATH/pangolin-data/pangolin_data/data/lineages.hash.csv
+cp $REPO_PATH/pango-designation/pango_designation/alias_key.json $REPO_PATH/pangolin-data/pangolin_data/data/
+
+
+cd $REPO_PATH/pangoLEARN && git pull
+
+cp $REPO_PATH/pangolin-data/pangolin_data/data/lineageTree.pb $REPO_PATH/pangoLEARN/pangoLEARN/data/lineageTree.pb
+
+git add $REPO_PATH/pangoLEARN/pangoLEARN/data/*
+git add $REPO_PATH/pangoLEARN/pangoLEARN/__init__.py
+git status
+
+git commit -m "adding latest decision tree and rf model to pangoLEARN repo for trained version $PLEARN_VERSION corresponding to $PANGO_VERSION"
+git push --set-upstream origin "prerelease_$PLEARN_VERSION"
+git checkout master
+
+cd $REPO_PATH/pangolin-data && git pull
+git status
+git add $REPO_PATH/pangolin-data/pangolin_data/data/*
+git add $REPO_PATH/pangolin-data/pangolin_data/__init__.py
+git commit -m "adding latest hash, alias file and rf model corresponding to $PANGO_VERSION"
+git push --set-upstream origin "prerelease_$PANGO_VERSION"
+git push origin HEAD:"prerelease_$PANGO_VERSION"
+git checkout main
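
The script reads its three positional arguments at the top ($1, $2, $3); a hypothetical invocation with placeholder values, sketched via subprocess:

    # Hypothetical call of the staging script; every value below is a placeholder.
    import subprocess

    subprocess.run(
        ["bash", "pangoLEARN/scripts/copy_files_push_to_branch.sh",
         "/path/to/training/output",  # $1 -> OUTDIR
         "v1.18",                     # $2 -> PANGO_VERSION
         "2022-12-23"],               # $3 -> PLEARN_VERSION
        check=True)
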
=====================================
pangoLEARN/scripts/curate_alignment.smk
=====================================
@@ -1,61 +1,46 @@
-import csv
-from Bio import SeqIO
import os
+import sys
+sys.path.insert(0, '/localdisk/home/s1680070/repositories/pangoLEARN')
+
+from Bio import SeqIO
+
import collections
import hashlib
import collections
import csv
+from pangoLEARN.training.get_lineage_positions import get_relevant_positions
+from pangoLEARN.training.utils import *
+
from Bio import SeqIO
-from pangoLEARN.training import downsample
from datetime import date
today = date.today()
+csv.field_size_limit(sys.maxsize)
-def get_hash_string(record):
- seq = str(record.seq).upper().encode()
- hash_object = hashlib.md5(seq)
- hash_string = hash_object.hexdigest()
- return hash_string
-
-def get_dict(in_csv,name_column,data_column):
- this_dict = {}
- with open(in_csv,"r") as f:
- reader = csv.DictReader(f)
- for row in reader:
- this_dict[row[name_column]] = row[data_column]
- return this_dict
+repo_path = config["repo_path"].rstrip("/")
+pangoLEARN_path = os.path.join(repo_path, "pangoLEARN")
+pango_designation_path = os.path.join(repo_path, "pango-designation")
-def add_to_hash(seq_file):
- hash_map = {}
- seq_hash = {}
- for record in SeqIO.parse(seq_file, "fasta"):
- seq = str(record.seq).upper().encode()
- hash_object = hashlib.md5(seq)
- hash_map[hash_object.hexdigest()] = record.id
- seq_hash[str(record.seq)] = record.id
- return hash_map,seq_hash
-
-pangoLEARN_path = config["pangoLEARN_path"].rstrip("/")
-pangolin_path = config["pangolin_path"].rstrip("/")
-pango_designation_path = config["pango_designation_path"].rstrip("/")
-quokka_path = config["quokka_path"].rstrip("/")
+# config["pango_version"] = get_pango_version(pango_designation_path)
data_date = config["data_date"]
config["trim_start"] = 265
config["trim_end"] = 29674
config["lineages_csv"]=f"{pango_designation_path}/lineages.csv"
-config["reference"] = f"{pangolin_path}/pangolin/data/reference.fasta"
-config["outgroups"] = f"{pangoLEARN_path}/pangoLEARN/training/outgroups.csv"
+config["reference"] = f"{pangoLEARN_path}/pangoLEARN/training/reference.fasta"
config["genbank_ref"] = f"{pangoLEARN_path}/pangoLEARN/training/WH04.gb"
config["datadir"]= f"/localdisk/home/shared/raccoon-dog/{data_date}_gisaid/publish/gisaid"
rule all:
input:
os.path.join(config["outdir"],"alignment.filtered.fasta"),
- os.path.join(config["outdir"],"decision_tree_rules.zip"),
+ # os.path.join(config["outdir"],"lineage_recall_report.txt"),
os.path.join(config["outdir"],"pangolearn.init.py"),
+ os.path.join(config["outdir"],"pangolin_data.init.py"),
+ os.path.join(config["outdir"],"training_summary.rf.txt"),
+ os.path.join(config["outdir"],"decision_tree_rules.txt"),
os.path.join(config["outdir"],"lineage.hash.csv")
-rule make_init:
+rule make_plearn_init:
output:
init = os.path.join(config["outdir"],"pangolearn.init.py")
run:
@@ -65,6 +50,22 @@ rule make_init:
fw.write(f'''_program = "pangoLEARN"
__version__ = "{pangolearn_new_v}"
PANGO_VERSION = "{pango_version}"
+
+__all__ = ["training"]
+
+from pangoLEARN import *
+''')
+
+
+rule make_pdata_init:
+ output:
+ init = os.path.join(config["outdir"],"pangolin_data.init.py")
+ run:
+ pango_version = config["pango_version"]
+ with open(output.init,"w") as fw:
+ fw.write(f'''_program = "pangolin_data"
+__version__ = "{pango_version}"
+
''')
rule filter_alignment:
@@ -146,7 +147,7 @@ rule get_variants:
input:
sam = os.path.join(config["outdir"],"alignment.sam")
output:
- csv = os.path.join(config["outdir"],"variants.csv")
+ csv = os.path.join(config["outdir"],"variants.csv") #gisaid.mutations.csv
shell:
"""
gofasta sam variants -t {workflow.cores} \
@@ -158,7 +159,7 @@ rule get_variants:
rule add_lineage:
input:
- csv = os.path.join(config["outdir"],"variants.csv"),
+ csv = os.path.join(config["outdir"],"variants.csv"), #gisaid.mutations.csv
lineages = os.path.join(config["outdir"],"lineages.designated.csv")
output:
csv = os.path.join(config["outdir"],"variants.lineages.csv")
@@ -179,80 +180,52 @@ rule add_lineage:
elif name in lineages_dict:
fw.write(f"{name},{variants},{lineages_dict[name]},\n")
-
-rule downsample:
+rule filter_metadata:
input:
csv = os.path.join(config["outdir"],"variants.lineages.csv"),
fasta = os.path.join(config["outdir"],"alignment.filtered.fasta")
output:
- csv = os.path.join(config["outdir"],"metadata.copy.csv"),
- fasta = os.path.join(config["outdir"],"alignment.downsample.fasta")
+ csv = os.path.join(config["outdir"],"metadata.final.csv")
run:
- downsample.downsample(
- input.csv,
- output.csv,
- input.fasta,
- output.fasta,
- 1, config["outgroups"],
- False,
- False,
- 10)
-
-rule filter_metadata:
- input:
- csv = os.path.join(config["outdir"],"metadata.copy.csv"),
- fasta = os.path.join(config["outdir"],"alignment.downsample.fasta")
- output:
- csv = os.path.join(config["outdir"],"metadata.downsample.csv")
- run:
- in_downsample = {}
+ in_list = {}
for record in SeqIO.parse(input.fasta,"fasta"):
- in_downsample[record.id] = 1
+ in_list[record.id] = 1
with open(output.csv, "w") as fw:
fw.write("sequence_name,lineage\n")
with open(input.csv, "r") as f:
reader = csv.DictReader(f)
for row in reader:
- if row["sequence_name"] in in_downsample:
+ if row["sequence_name"] in in_list:
name = row["sequence_name"]
lineage = row["lineage"]
fw.write(f"{name},{lineage}\n")
-
rule get_relevant_postions:
input:
- fasta = os.path.join(config["outdir"],"alignment.downsample.fasta"),
- csv = os.path.join(config["outdir"],"metadata.downsample.csv"),
+ fasta = os.path.join(config["outdir"],"alignment.filtered.fasta"),
+ csv = os.path.join(config["outdir"],"metadata.final.csv"),
reference = config["reference"]
- params:
- path_to_script = quokka_path
output:
relevant_pos_obj = os.path.join(config["outdir"],"relevantPositions.pickle"),
- shell:
- """
- python {params.path_to_script}/quokka/getRelevantLocationsObject.py \
- {input.reference:q} \
- {input.fasta} \
- {input.csv:q} \
- {config[outdir]}
- """
+ run:
+ get_relevant_positions(input.csv,input.fasta,input.reference,output.relevant_pos_obj)
-rule run_training:
+rule run_rf_training:
input:
- fasta = os.path.join(config["outdir"],"alignment.downsample.fasta"),
- csv = os.path.join(config["outdir"],"metadata.downsample.csv"),
+ fasta = os.path.join(config["outdir"],"alignment.filtered.fasta"),
+ csv = os.path.join(config["outdir"],"metadata.final.csv"),
reference = config["reference"],
relevant_pos_obj = rules.get_relevant_postions.output.relevant_pos_obj
params:
path_to_script = pangoLEARN_path
output:
- headers = os.path.join(config["outdir"],"decisionTreeHeaders_v1.joblib"),
- model = os.path.join(config["outdir"],"decisionTree_v1.joblib"),
- txt = os.path.join(config["outdir"],"training_summary.txt")
+ headers = os.path.join(config["outdir"],"randomForestHeaders_v1.joblib"),
+ model = os.path.join(config["outdir"],"randomForest_v1.joblib"),
+ txt = os.path.join(config["outdir"],"training_summary.rf.txt")
shell:
"""
- python {params.path_to_script}/pangoLEARN/training/pangoLEARNDecisionTree_v1.py \
+ python {params.path_to_script}/pangoLEARN/training/pangoLEARNRandomForest_v1.py \
{input.csv:q} \
{input.fasta} \
{input.reference:q} \
@@ -261,27 +234,38 @@ rule run_training:
> {output.txt:q}
"""
-rule get_recall:
+rule run_dt_training:
input:
- txt = rules.run_training.output.txt
+ fasta = os.path.join(config["outdir"],"alignment.filtered.fasta"),
+ csv = os.path.join(config["outdir"],"metadata.final.csv"),
+ reference = config["reference"],
+ relevant_pos_obj = rules.get_relevant_postions.output.relevant_pos_obj
params:
path_to_script = pangoLEARN_path
output:
- txt = os.path.join(config["outdir"],"lineage_recall_report.txt")
+ headers = os.path.join(config["outdir"],"decisionTreeHeaders_v1.joblib"),
+ model = os.path.join(config["outdir"],"decisionTree_v1.joblib"),
+ txt = os.path.join(config["outdir"],"training_summary.dt.txt")
shell:
"""
- python {params.path_to_script}/pangoLEARN/training/processOutputFile.py {input.txt} > {output.txt}
+ python {params.path_to_script}/pangoLEARN/training/pangoLEARNDecisionTree_v1.py \
+ {input.csv:q} \
+ {input.fasta} \
+ {input.reference:q} \
+ {config[outdir]} \
+ {input.relevant_pos_obj} \
+ > {output.txt:q}
"""
rule get_decisions:
input:
headers = os.path.join(config["outdir"],"decisionTreeHeaders_v1.joblib"),
model = os.path.join(config["outdir"],"decisionTree_v1.joblib"),
- txt = rules.run_training.output.txt
+ txt = rules.run_dt_training.output.txt
params:
path_to_script = pangoLEARN_path
output:
- txt = os.path.join(config["outdir"],"tree_rules.txt"),
+ txt = os.path.join(config["outdir"],"decision_tree_rules.txt"),
zipped = os.path.join(config["outdir"],"decision_tree_rules.zip")
shell:
"""
@@ -290,6 +274,18 @@ rule get_decisions:
> {output.txt:q} && zip {output.zipped:q} {output.txt:q}
"""
+# rule get_recall:
+# input:
+# txt = rules.run_rf_training.output.txt
+# params:
+# path_to_script = pangoLEARN_path
+# output:
+# txt = os.path.join(config["outdir"],"lineage_recall_report.txt")
+# shell:
+# """
+# python {params.path_to_script}/pangoLEARN/training/processOutputFile.py {input.txt} > {output.txt}
+# """
+
rule create_hash:
input:
fasta = os.path.join(config["outdir"],"alignment.filtered.fasta"),
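
In place of the four per-repository paths, the revised workflow reads a single repo_path key; a sketch of the --config values it now expects (all placeholders), matching what the updated training_runner.sh below passes:

    # Hypothetical --config keys for the revised workflow; all values are placeholders.
    config = {
        "repo_path": "/path/to/repositories",  # replaces pangoLEARN_path, pangolin_path,
                                               # pango_designation_path and quokka_path
        "outdir": "/path/to/outdir",
        "data_date": "2022-12-23",
        "pangolearn_version": "2022-12-23",
        "pango_version": "v1.18",
    }
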
=====================================
pangoLEARN/scripts/training_runner.sh
=====================================
@@ -12,7 +12,7 @@ echo $OUTDIR
if [ -d $OUTDIR ]
then
echo "Directory $OUTDIR exists."
-else
+els
mkdir $OUTDIR
echo "Directory $OUTDIR does not exist, making it."
fi
@@ -32,32 +32,22 @@ then
else
DATA_DATE=$2
fi
-echo "$UID training version $DATA_DATE"
+echo "training version $DATA_DATE"
# LATEST_DATA=$(ls -td /localdisk/home/shared/raccoon-dog/2021*_gisaid/publish/gisaid | head -n 1)
-REPO_PATH=/localdisk/home/$UID/repositories
+REPO_PATH=/localdisk/home/s1680070/repositories
PANGO_PATH=$REPO_PATH/pango-designation
PLEARN_PATH=$REPO_PATH/pangoLEARN
-PANGOLIN_PATH=$REPO_PATH/pangolin
-QUOKKA_PATH=$REPO_PATH/quokka
echo "pango designation path $PANGO_PATH"
echo "pangoLEARN path $PLEARN_PATH"
-echo "pangolin path $PANGOLIN_PATH"
-
cd $PANGO_PATH && git pull #gets any updates to the reports in the data directory
-PANGO_V=$(git tag --points-at HEAD)
-echo "pango version $PANGO_V"
-
+PANGO_VERSION=$(git describe --tags --abbrev=0)
+echo $PANGO_VERSION
cd /localdisk/home/shared/raccoon-dog/ #gets any updates to the reports in the data directory
-echo "--config outdir=$OUTDIR data_date=$DATA_DATE pangolearn_version=$TODAY pango_version=$PANGO_V"
-echo "pangoLEARN training starting" | mail -s "update lineageTree.pb with pango designation version $PANGO_V" angie at soe.ucsc.edu
-snakemake --snakefile $PLEARN_PATH/pangoLEARN/scripts/curate_alignment.smk --rerun-incomplete --nolock --cores 1 --config pango_designation_path=$PANGO_PATH pangolin_path=$PANGOLIN_PATH pangoLEARN_path=$PLEARN_PATH quokka_path=$QUOKKA_PATH outdir=$OUTDIR data_date=$DATA_DATE pangolearn_version=$TODAY pango_version=$PANGO_V
-
-# cp $OUTDIR/pangolearn.init.py /localdisk/home/s1680070/repositories/pangoLEARN/pangoLEARN/__init__.py
-# cp $OUTDIR/decision* /localdisk/home/s1680070/repositories/pangoLEARN/pangoLEARN/data/
-# cp $OUTDIR/metadata.downsample.csv /localdisk/home/s1680070/repositories/pangoLEARN/pangoLEARN/data/lineages.downsample.csv
-# cp $OUTDIR/lineage.hash.csv /localdisk/home/s1680070/repositories/pangoLEARN/pangoLEARN/data/lineages.hash.csv
+echo "--config outdir=$OUTDIR data_date=$DATA_DATE pangolearn_version=$TODAY "
+echo "pangoLEARN training starting" | mail -s "update lineageTree.pb with pango designation version $PANGO_VERSION" angie at soe.ucsc.edu
+snakemake --snakefile $PLEARN_PATH/pangoLEARN/scripts/curate_alignment.smk --rerun-incomplete --nolock --cores 1 --config repo_path=$REPO_PATH outdir=$OUTDIR data_date=$DATA_DATE pangolearn_version=$TODAY pango_version=$PANGO_VERSION
=====================================
pangoLEARN/training/get_lineage_positions.py
=====================================
@@ -0,0 +1,52 @@
+import csv
+import pickle
+import collections
+from Bio import SeqIO
+from Bio.Align import MultipleSeqAlignment
+from Bio.Align.AlignInfo import SummaryInfo
+
+## outputs a dataframe of positions to input in training
+
+def get_lineage_cns50_sites(lineage, lineage_taxa, sequences_index, reference_sequence):
+
+ ### output of list of sequence_IDs for the given lineage
+
+ lineage_seqs = MultipleSeqAlignment([])
+ for taxon in lineage_taxa:
+ lineage_seqs.append(sequences_index[taxon])
+
+ info = SummaryInfo(lineage_seqs)
+ consensus_sequence = info.gap_consensus(
+ threshold=0.50,
+ ambiguous='N')
+
+ nuc_position = []
+ for i in range(len(consensus_sequence)):
+ if reference_sequence[i] != "N":
+ if consensus_sequence[i] != "N":
+ if consensus_sequence[i] != reference_sequence[i]:
+ nuc_position.append(i)
+ return nuc_position
+
+def get_relevant_positions(designation_file,seq_file,ref_file,outfile):
+ reference = SeqIO.read(ref_file, "fasta")
+ sequences_index = SeqIO.index(seq_file, "fasta")
+ lineage_designations = collections.defaultdict(list)
+ lineage_set = set()
+
+ with open(designation_file, "r") as f:
+ reader = csv.DictReader(f)
+ for row in reader:
+ lineage_designations[row["lineage"]].append(row["sequence_name"])
+ lineage_set.add(row["lineage"])
+
+ final_positions = set()
+ for lineage in lineage_set:
+ print(f"Getting positions for lineage {lineage}")
+ positions = get_lineage_cns50_sites(lineage, lineage_designations[lineage], sequences_index, reference)
+ print(f"\tFound {len(positions)}")
+ for i in positions:
+ final_positions.add(i)
+
+ with open(outfile, 'wb') as pickle_file:
+ pickle.dump(final_positions, pickle_file)
\ No newline at end of file
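
A minimal standalone use of the new helper, mirroring the run block of rule get_relevant_postions (file names are placeholders):

    # Hypothetical standalone call; writes a pickled set of 0-based genome positions.
    import pickle
    from pangoLEARN.training.get_lineage_positions import get_relevant_positions

    get_relevant_positions("metadata.final.csv",        # designation_file: sequence_name,lineage
                           "alignment.filtered.fasta",  # seq_file: aligned sequences
                           "reference.fasta",           # ref_file: reference genome
                           "relevantPositions.pickle")  # outfile

    with open("relevantPositions.pickle", "rb") as fh:
        relevant_positions = pickle.load(fh)            # set of ints
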
=====================================
pangoLEARN/training/outgroups.csv deleted
=====================================
@@ -1,11 +0,0 @@
-lineage,outgroup
-A,Wuhan/WH04/2020
-B,Wuhan/WHU01/2020
-B.1,Italy/ABR-IZSGC-TE5166/2020
-B.1.1,Germany/BY-MVP-V2010837/2020
-B.1.177,Spain/VC-IBV-98006461/2020
-B.1.1.7,England/MILK-9E05B3/2020
-B.1.160,Belgium/rega-10021225/2020
-B.1.2,USA/TX-HMH-MCoV-16306/2020
-B.1.258,England/LEED-2A8A64/2020
-B.1.243,USA/TX-HMH-MCoV-18678/2020
=====================================
pangoLEARN/training/pangoLEARNDecisionTree_v1.py
=====================================
@@ -12,6 +12,8 @@ import os
from sklearn.model_selection import cross_val_score
from Bio import SeqIO
import pickle
+import time
+import os.path
# file with lineage assignments
lineage_file = sys.argv[1]
@@ -89,16 +91,9 @@ def readInAndFormatData():
dataList.append(getDataLine(referenceId, referenceSeq))
# create a dictionary of sequence ids to their assigned lineages
- with open(lineage_file, 'r') as f:
- for line in f:
- line = line.strip()
-
- split = line.split(",")
-
- idToLineage[split[0]] = split[1]
-
- # close the file
- f.close()
+ lineage_designations = pd.read_csv(lineage_file, delimiter=",", dtype=str)
+ for index, row in lineage_designations.iterrows():
+ idToLineage[row["sequence_name"]] = row["lineage"]
seq_dict = {rec.id : rec.seq for rec in SeqIO.parse(sequence_file, "fasta")}
@@ -301,15 +296,14 @@ h = feature_cols.pop(0)
X = pima[feature_cols]
y = pima[h]
-# separate the data frame into testing/training data sets. 25% of the data will be used for training, 75% for test.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=testing_percentage,random_state=0)
print("training " + datetime.now().strftime("%m/%d/%Y, %H:%M:%S"), flush=True)
header_out = os.path.join(sys.argv[4],"decisionTreeHeaders_v1.joblib")
-joblib.dump(headers, header_out, compress=9)
+joblib.dump(headers, header_out, compress=('lzma', 9))
-# instantiate the random forest with 1000 trees
+# instantiate the random forest with 50 trees
dt = DecisionTreeClassifier()
# fit the model
@@ -340,7 +334,8 @@ print(metrics.classification_report(y_test, y_pred, digits=3))
# save the model files to compressed joblib files
# using joblib instead of pickle because these large files need to be compressed
model_out = os.path.join(sys.argv[4],"decisionTree_v1.joblib")
-joblib.dump(dt, model_out, compress=9)
+joblib.dump(dt, model_out, compress=('lzma', 9))
+
print("model files created", flush=True)
=====================================
pangoLEARN/training/pangoLEARNRandomForest_v1.py
=====================================
@@ -0,0 +1,355 @@
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn import metrics
+from sklearn.datasets import make_classification
+from sklearn.model_selection import StratifiedShuffleSplit
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix, make_scorer
+from datetime import datetime
+import joblib
+import sys
+import os
+from sklearn.model_selection import cross_val_score
+from Bio import SeqIO
+import pickle
+import time
+import os.path
+
+# file with lineage assignments
+lineage_file = sys.argv[1]
+# file with sequences
+sequence_file = sys.argv[2]
+# how much of the data will be used for testing, instead of training
+testing_percentage = 0.0000000001
+
+relevant_positions = pickle.load(open(sys.argv[5], 'rb'))
+relevant_positions.add(0)
+
+# the path to the reference file.
+# This reference sequence must be the same as is used in the pangolearn script!!
+referenceFile = sys.argv[3]
+
+# data storage
+dataList = []
+# dict for lookup efficiency
+indiciesToKeep = dict()
+
+referenceId = "Wuhan/WH04/2020"
+referenceSeq = ""
+
+idToLineage = dict()
+idToSeq = dict()
+
+mustKeepIds = []
+mustKeepLineages = []
+
+
+# function for handling weird sequence characters
+def clean(x, loc):
+ x = x.upper()
+
+ if x == 'T' or x == 'A' or x == 'G' or x == 'C' or x == '-':
+ return x
+
+ if x == 'U':
+ return 'T'
+
+ # otherwise return value from reference
+ return referenceSeq[loc]
+
+def findReferenceSeq():
+ with open(referenceFile) as f:
+ currentSeq = ""
+
+ for line in f:
+ if ">" not in line:
+ currentSeq = currentSeq + line.strip()
+
+ f.close()
+ return currentSeq
+
+
+def getDataLine(seqId, seq):
+ dataLine = []
+ dataLine.append(seqId)
+
+ newSeq = ""
+
+ # for each character in the sequence
+ for index in range(len(seq)):
+ newSeq = newSeq + clean(seq[index], index)
+
+ dataLine.append(newSeq)
+
+ return dataLine
+
+
+def readInAndFormatData():
+
+ # add the data line for the reference seq
+ idToLineage[referenceId] = "A"
+ dataList.append(getDataLine(referenceId, referenceSeq))
+
+ # create a dictionary of sequence ids to their assigned lineages
+ lineage_designations = pd.read_csv(lineage_file, delimiter=",", dtype=str)
+ for index, row in lineage_designations.iterrows():
+ idToLineage[row["sequence_name"]] = row["lineage"]
+
+ seq_dict = {rec.id : rec.seq for rec in SeqIO.parse(sequence_file, "fasta")}
+
+ print("files read in, now processing")
+
+ for key in seq_dict.keys():
+ if key in idToLineage:
+ dataList.append(getDataLine(key, seq_dict[key]))
+ else:
+ print("unable to find the lineage classification for: " + key)
+
+
+# find columns in the data list which always have the same value
+def findColumnsWithoutSNPs():
+
+ # for each index in the length of each sequence
+ for index in range(len(dataList[0][1])):
+ keep = False
+
+ # loop through all lines
+ for line in dataList:
+
+ # if there is a difference somewhere, then we want to keep it
+ if dataList[0][1][index] != line[1][index] or index == 0:
+ keep = True
+ break
+
+ # otherwise, save it
+ if keep and index in relevant_positions:
+ indiciesToKeep[index] = True
+
+
+# remove columns from the data list which don't have any SNPs. We do this because
+# these columns won't be relevant for a logistic regression which is trying to use
+# differences between sequences to assign lineages
+def removeOtherIndices(indiciesToKeep):
+
+ # instantiate the final list
+ finalList = []
+
+ indicies = list(indiciesToKeep.keys())
+ indicies.sort()
+
+ # while the dataList isn't empty
+ while len(dataList) > 0:
+
+ # pop the first line
+ line = dataList.pop(0)
+ seqId = line.pop(0)
+
+ line = line[0]
+ # initialize the finalLine
+ finalLine = []
+
+ for index in indicies:
+ if index == 0:
+ # if its the first index, then that's the lineage assignment, so keep it
+ finalLine.append(seqId)
+ else:
+ # otherwise keep everything at the indices in indiciesToKeep
+ finalLine.append(line[index])
+
+ # save the finalLine to the finalList
+ finalList.append(finalLine)
+
+ # return
+ return finalList
+
+def allEqual(list):
+ entries = dict()
+
+ for i in list:
+ if i not in entries:
+ entries[i] = True
+
+ return len(entries) == 1
+
+def removeAmbiguous():
+ idsToRemove = set()
+ lineMap = dict()
+ idMap = dict()
+
+ for line in dataList:
+ keyString = ",".join(line[1:])
+
+ if keyString not in lineMap:
+ lineMap[keyString] = []
+ idMap[keyString] = []
+
+ if line[0] in idToLineage:
+ lineMap[keyString].append(idToLineage[line[0]])
+ idMap[keyString].append(line[0])
+ else:
+ print("diagnostics")
+ print(line[0])
+ print(keyString)
+ print(line)
+ for key in lineMap:
+ if not allEqual(lineMap[key]):
+
+ skipRest = False
+
+ # see if any protected lineages are contained in the set, if so keep those ids
+ for lineage in lineMap[key]:
+ if lineage in mustKeepLineages:
+ skipRest = True
+
+ for i in idMap[key]:
+ if lineage != idToLineage[i] and i not in mustKeepIds:
+ idsToRemove.add(i)
+
+ # none of the lineages are protected, fire at will
+ if not skipRest:
+
+ lineageToCounts = dict()
+
+ aLineage = False
+ # find most common lineage
+ for lineage in lineMap[key]:
+ if lineage not in lineageToCounts:
+ lineageToCounts[lineage] = 0
+
+ lineageToCounts[lineage] = lineageToCounts[lineage] + 1
+ aLineage = lineage
+
+ m = aLineage
+ for lineage in lineageToCounts:
+ if lineageToCounts[lineage] > lineageToCounts[m]:
+ m = lineage
+
+
+ for i in idMap[key]:
+ if m != idToLineage[i]:
+ idsToRemove.add(i)
+
+ newList = []
+
+ print("keeping indicies:")
+
+ for line in dataList:
+ if line[0] not in idsToRemove:
+ print(line[0])
+ line[0] = idToLineage[line[0]]
+ newList.append(line)
+
+ return newList
+
+
+print("reading in data " + datetime.now().strftime("%m/%d/%Y, %H:%M:%S"), flush=True)
+
+referenceSeq = findReferenceSeq()
+
+readInAndFormatData()
+
+print("processing snps, formatting data " + datetime.now().strftime("%m/%d/%Y, %H:%M:%S"), flush=True)
+
+findColumnsWithoutSNPs()
+
+dataList = removeOtherIndices(indiciesToKeep)
+
+print("# sequences before blacklisting")
+print(len(dataList))
+
+dataList = removeAmbiguous()
+
+print("# sequences after blacklisting")
+print(len(dataList))
+
+# headers are the original genome locations
+headers = list(indiciesToKeep.keys())
+headers[0] = "lineage"
+
+print("setting up training " + datetime.now().strftime("%m/%d/%Y, %H:%M:%S"), flush=True)
+
+# checkpoint_file = os.path.join(sys.argv[4], "dataList.pickle")
+# dataList.to_pickle(checkpoint_file)
+# headers
+
+pima = pd.DataFrame(dataList, columns=headers)
+
+# nucleotide symbols which can appear
+categories = ['A', 'C', 'G', 'T', '-']
+
+# one hot encoding of all headers other than the first which is the lineage
+dummyHeaders = headers[1:]
+
+# add extra rows to ensure all of the categories are represented, as otherwise
+# not enough columns will be created when we call get_dummies
+for i in categories:
+ line = [i] * len(dataList[0])
+ pima.loc[len(pima)] = line
+
+# get one-hot encoding
+pima = pd.get_dummies(pima, columns=dummyHeaders)
+
+# get rid of the fake data we just added
+pima.drop(pima.tail(len(categories)).index, inplace=True)
+
+feature_cols = list(pima)
+print(feature_cols)
+
+# remove the last column from the data frame. This is because we are trying to predict these values.
+h = feature_cols.pop(0)
+X = pima[feature_cols]
+y = pima[h]
+
+X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=testing_percentage,random_state=0)
+
+print("training " + datetime.now().strftime("%m/%d/%Y, %H:%M:%S"), flush=True)
+
+header_out = os.path.join(sys.argv[4],"randomForestHeaders_v1.joblib")
+joblib.dump(headers, header_out, compress=('lzma', 9))
+
+# instantiate the random forest with 50 trees
+rf = RandomForestClassifier(n_estimators=12, max_features=0.05, min_samples_split=10)
+
+# fit the model
+rf.fit(X,y)
+
+print("testing " + datetime.now().strftime("%m/%d/%Y, %H:%M:%S"), flush=True)
+
+# classify the test data
+y_pred=rf.predict(X_test)
+
+print(y_pred)
+
+# get the scores from these predictions
+y_scores = rf.predict_proba(X_test)
+
+print("generating statistics " + datetime.now().strftime("%m/%d/%Y, %H:%M:%S"), flush=True)
+
+#print the confusion matrix
+print("--------------------------------------------")
+print("Confusion Matrix")
+cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
+print(cnf_matrix)
+
+print("--------------------------------------------")
+print("Classification report")
+print(metrics.classification_report(y_test, y_pred, digits=3))
+
+# save the model files to compressed joblib files
+# using joblib instead of pickle because these large files need to be compressed
+model_out = os.path.join(sys.argv[4],"randomForest_v1.joblib")
+joblib.dump(rf, model_out, compress=('lzma', 9))
+
+
+print("model files created", flush=True)
+
+# this method is used below when running 10-fold cross validation. It ensures
+# that the per-lineage statistics are generated for each cross-fold
+def classification_report_with_accuracy_score(y_true, y_pred):
+ print("--------------------------------------------")
+ print("Crossfold Classification Report")
+ print(metrics.classification_report(y_true, y_pred, digits=3))
+ return accuracy_score(y_true, y_pred)
+
+# optionally, run 10-fold cross validation (comment this out if not needed as it takes a while to run)
+# cross_validation_scores = cross_val_score(rf, X=X, y=y, cv=10, scoring=make_scorer(classification_report_with_accuracy_score))
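
The new trainer keeps the positional interface of the decision-tree script; a hypothetical invocation matching rule run_rf_training above (placeholder paths):

    # Hypothetical invocation; argv order follows the sys.argv reads at the top of the script.
    import subprocess

    with open("training_summary.rf.txt", "w") as summary:
        subprocess.run(
            ["python", "pangoLEARN/training/pangoLEARNRandomForest_v1.py",
             "metadata.final.csv",         # sys.argv[1]: lineage designations
             "alignment.filtered.fasta",   # sys.argv[2]: aligned sequences
             "reference.fasta",            # sys.argv[3]: reference fasta
             "/path/to/outdir",            # sys.argv[4]: output directory for the joblib files
             "relevantPositions.pickle"],  # sys.argv[5]: pickled relevant positions
            stdout=summary, check=True)
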
=====================================
pangoLEARN/training/processOutputFile.py
=====================================
@@ -26,6 +26,7 @@ with open(outputfile, 'r') as f:
split = line.split()
if "macro avg" in line or "weighted avg" in line:
+
name = split[0] + " " + split[1]
if name not in nameToLineage:
@@ -36,7 +37,7 @@ with open(outputfile, 'r') as f:
nameToLineage[name].f1s.append(float(split[4]))
nameToLineage[name].supports.append(int(split[5]))
- if len(split) == 5 and ":" not in line:
+ if len(split) == 5 and ":" not in line and "read" not in line:
name = split[0]
if name not in nameToLineage:
@@ -54,4 +55,4 @@ for key in nameToLineage:
nameToLineage[key].printStats()
nameToLineage["macro avg"].printStats()
-nameToLineage["weighted avg"].printStats()
\ No newline at end of file
+nameToLineage["weighted avg"].printStats()
=====================================
pangoLEARN/training/reference.fasta
=====================================
@@ -0,0 +1,2 @@
+>outgroup_A
+NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAGATGGCACTTGTGGCTTAGTAGAAGTTGAAAAAGGCGTTTTGCCTCAACTTGAACAGCCCTATGTGTTCATCAAACGTTCGGATGCTCGAACTGCACCTCATGGTCATGTTATGGTTGAGCTGGTAGCAGAACTCGAAGGCATTCAGTACGGTCGTAGTGGTGAGACACTTGGTGTCCTTGTCCCTCATGTGGGCGAAATACCAGTGGCTTACCGCAAGGTTCTTCTTCGTAAGAACGGTAATAAAGGAGCTGGTGGCCATAGTTACGGCGCCGATCTAAAGTCATTTGACTTAGGCGACGAGCTTGGCACTGATCCTTATGAAGATTTTCAAGAAAACTGGAACACTAAACATAGCAGTGGTGTTACCCGTGAACTCATGCGTGAGCTTAACGGAGGGGCATACACTCGCTATGTCGATAACAACTTCTGTGGCCCTGATGGCTACCCTCTTGAGTGCATTAAAGACCTTCTAGCACGTGCTGGTAAAGCTTCATGCACTTTGTCCGAACAACTGGACTTTATTGACACTAAGAGGGGTGTATACTGCTGCCGTGAACATGAGCATGAAATTGCTTGGTACACGGAACGTTCTGAAAAGAGCTATGAATTGCAGACACCTTTTGAAATTAAATTGGCAAAGAAATTTGACACCTTCAATGGGGAATGTCCAAATTTTGTATTTCCCTTAAATTCCATAATCAAGACTATTCAACCAAGGGTTGAAAAGAAAAAGCTTGATGGCTTTATGGGTAGAATTCGATCTGTCTATCCAGTTGCGTCACCAAATGAATGCAACCAAATGTGCCTTTCAACTCTCATGAAGTGTGATCATTGTGGTGAAACTTCATGGCAGACGGGCGATTTTGTTAAAGCCACTTGCGAATTTTGTGGCACTGAGAATTTGACTAAAGAAGGTGCCACTACTTGTGGTTACTTACCCCAAAATGCTGTTGTTAAAATTTATTGTCCAGCATGTCACAATTCAGAAGTAGGACCTGAGCATAGTCTTGCCGAATACCATAATGAATCTGGCTTGAAAACCATTCTTCGTAAGGGTGGTCGCACTATTGCCTTTGGAGGCTGTGTGTTCTCTTATGTTGGTTGCCATAACAAGTGTGCCTATTGGGTTCCACGTGCTAGCGCTAACATAGGTTGTAACCATACAGGTGTTGTTGGAGAAGGTTCCGAAGGTCTTAATGACAACCTTCTTGAAATACTCCAAAAAGAGAAAGTCAACATCAATATTGTTGGTGACTTTAAACTTAATGAAGAGATCGCCATTATTTTGGCATCTTTTTCTGCTTCCACAAGTGCTTTTGTGGAAACTGTGAAAGGTTTGGATTATAAAGCATTCAAACAAATTGTTGAATCCTGTGGTAATTTTAAAGTTACAAAAGGAAAAGCTAAAAAAGGTGCCTGGAATATTGGTGAACAGAAATCAATACTGAGTCCTCTTTATGCATTTGCATCAGAGGCTGCTCGTGTTGTACGATCAATTTTCTCCCGCACTCTTGAAACTGCTCAAAATTCTGTGCGTGTTTTACAGAAGGCCGCTATAACAATACTAGATGGAATTTCACAGTATTCACTGAGACTCATTGATGCTATGATGTTCACATCTGATTTGGCTACTAACAATCTAGTTGTAATGGCCTACATTACAGGTGGTGTTGTTCAGTTGACTTCGCAGTGGCTAACTAACATCTTTGGCACTGTTTATGAAAAACTCAAACCCGTCCTTGATTGGCTTGAAGAGAAGTTTAAGGAAGGTGTAGAGTTTCTTAGAGACGGTTGGGAAATTGTTAAATTTATCTCAACCTGTGCTTGTGAAATTGTCGGTGGACAAATTGTCACCTGTGCAAAGGAAATTAAGGAGAGTGTTCAGACATTCTTTAAGCTTGTAAATAAATTTTTGGCTTTGTGTGCTGACTCTATCATTATTGGTGGAGCTAAACTTAAAGCCTTGAATTTAGGTGAAACATTTGTCACGCACTCAAAGGGATTGTACAGAAAGTGTGTTAAATCCAGAGAAGAAACTGGCCTACTCATGCCTCTAAAAGCCCCAAAAGAAATTATCTTCTTAGAGGGAGAAACACTTCCCACAGAAGTGTTAACAGAGGAAGTTGTCTTGAAAACTGGTGATTTACAACCATTAGAACAACCTACTAGTGAAGCTGTTGAAGCTCCATTGGTTGGTACACCAGTTTGTATTAACGGGCTTATGTTGCTCGAAATCAAAGACACAGAAAAGTACTGTGCCCTTGCACCTAATATGATGGTAACAAACAATACCTTCACACTCAAAGGCGGTGCACCAACAAAGGTTACTTTTGGTGATGACACTGTGATAGAAGTGCAAGGTTACAAGAGTGTGAATATCACTTTTGAACTTGATGAAAGGATTGATAAAGTACTTAATGAGAAGTGCTCTGCCTATACAGTTGAACTCGGTACAGAAGTAAATGAGTTCGCCTGTGTTGTGGCAGATGCTGTCATAAAAACTTTGCAACCAGTATCTGAATTACTTACACCACTGGGCATTGATTTAGATGAGTGGAGTATGGCTACATACTACTTATTTGATGAGTCTGGTGAGTTTAAATTGGCTTCACATATGTATTGTTCTTTCTACCCTCCAGATGAGGATGAAGAAGAAGGTGATTGTGAAGAAGAAGAGTTTGAGCCATCAACTCAATATGAGTATGGTACTGAAGATGATTACCAAGGTAAACCTTTGGAATTTGGTGCCACTTCTGCTGCTCTTCAACCTGAAGAAGAGCAAGAAGAAGATTGGTTAGATGATGATAGTCAACAAACTGTTGGTCAACAAGACGGCAGTGAGGACAATCAGACAACTACTATTCAAACAATTGTTGAGGTTCAACCTCAATTAGAGATGGAACTTACACCAGTTGTTCAGACTATTGAAGTGAATAGTTTTAGTGGTTATTTAAAACTTACTGACAATGTATACATTAAAAATGCAGACATTGTGGAAGAAGCTAAAAAGGTAAAACCAACAGTGGTTGTTAATGCAGCCAATGTTTACCTTAAACATGGAGGAGGTGTTGCAGGAGCCTTAAATAAGGCTACTAACAATGCCATGCAAGTTGAATCTGATGATTACATAGCTACTAATGGACCAC
TTAAAGTGGGTGGTAGTTGTGTTTTAAGCGGACACAATCTTGCTAAACACTGTCTTCATGTTGTCGGCCCAAATGTTAACAAAGGTGAAGACATTCAACTTCTTAAGAGTGCTTATGAAAATTTTAATCAGCACGAAGTTCTACTTGCACCATTATTATCAGCTGGTATTTTTGGTGCTGACCCTATACATTCTTTAAGAGTTTGTGTAGATACTGTTCGCACAAATGTCTACTTAGCTGTCTTTGATAAAAATCTCTATGACAAACTTGTTTCAAGCTTTTTGGAAATGAAGAGTGAAAAGCAAGTTGAACAAAAGATCGCTGAGATTCCTAAAGAGGAAGTTAAGCCATTTATAACTGAAAGTAAACCTTCAGTTGAACAGAGAAAACAAGATGATAAGAAAATCAAAGCTTGTGTTGAAGAAGTTACAACAACTCTGGAAGAAACTAAGTTCCTCACAGAAAACTTGTTACTTTATATTGACATTAATGGCAATCTTCATCCAGATTCTGCCACTCTTGTTAGTGACATTGACATCACTTTCTTAAAGAAAGATGCTCCATATATAGTGGGTGATGTTGTTCAAGAGGGTGTTTTAACTGCTGTGGTTATACCTACTAAAAAGGCTGGTGGCACTACTGAAATGCTAGCGAAAGCTTTGAGAAAAGTGCCAACAGACAATTATATAACCACTTACCCGGGTCAGGGTTTAAATGGTTACACTGTAGAGGAGGCAAAGACAGTGCTTAAAAAGTGTAAAAGTGCCTTTTACATTCTACCATCTATTATCTCTAATGAGAAGCAAGAAATTCTTGGAACTGTTTCTTGGAATTTGCGAGAAATGCTTGCACATGCAGAAGAAACACGCAAATTAATGCCTGTCTGTGTGGAAACTAAAGCCATAGTTTCAACTATACAGCGTAAATATAAGGGTATTAAAATACAAGAGGGTGTGGTTGATTATGGTGCTAGATTTTACTTTTACACCAGTAAAACAACTGTAGCGTCACTTATCAACACACTTAACGATCTAAATGAAACTCTTGTTACAATGCCACTTGGCTATGTAACACATGGCTTAAATTTGGAAGAAGCTGCTCGGTATATGAGATCTCTCAAAGTGCCAGCTACAGTTTCTGTTTCTTCACCTGATGCTGTTACAGCGTATAATGGTTATCTTACTTCTTCTTCTAAAACACCTGAAGAACATTTTATTGAAACCATCTCACTTGCTGGTTCCTATAAAGATTGGTCCTATTCTGGACAATCTACACAACTAGGTATAGAATTTCTTAAGAGAGGTGATAAAAGTGTATATTACACTAGTAATCCTACCACATTCCACCTAGATGGTGAAGTTATCACCTTTGACAATCTTAAGACACTTCTTTCTTTGAGAGAAGTGAGGACTATTAAGGTGTTTACAACAGTAGACAACATTAACCTCCACACGCAAGTTGTGGACATGTCAATGACATATGGACAACAGTTTGGTCCAACTTATTTGGATGGAGCTGATGTTACTAAAATAAAACCTCATAATTCACATGAAGGTAAAACATTTTATGTTTTACCTAATGATGACACTCTACGTGTTGAGGCTTTTGAGTACTACCACACAACTGATCCTAGTTTTCTGGGTAGGTACATGTCAGCATTAAATCACACTAAAAAGTGGAAATACCCACAAGTTAATGGTTTAACTTCTATTAAATGGGCAGATAACAACTGTTATCTTGCCACTGCATTGTTAACACTCCAACAAATAGAGTTGAAGTTTAATCCACCTGCTCTACAAGATGCTTATTACAGAGCAAGGGCTGGTGAAGCTGCTAACTTTTGTGCACTTATCTTAGCCTACTGTAATAAGACAGTAGGTGAGTTAGGTGATGTTAGAGAAACAATGAGTTACTTGTTTCAACATGCCAATTTAGATTCTTGCAAAAGAGTCTTGAACGTGGTGTGTAAAACTTGTGGACAACAGCAGACAACCCTTAAGGGTGTAGAAGCTGTTATGTACATGGGCACACTTTCTTATGAACAATTTAAGAAAGGTGTTCAGATACCTTGTACGTGTGGTAAACAAGCTACAAAATATCTAGTACAACAGGAGTCACCTTTTGTTATGATGTCAGCACCACCTGCTCAGTATGAACTTAAGCATGGTACATTTACTTGTGCTAGTGAGTACACTGGTAATTACCAGTGTGGTCACTATAAACATATAACTTCTAAAGAAACTTTGTATTGCATAGACGGTGCTTTACTTACAAAGTCCTCAGAATACAAAGGTCCTATTACGGATGTTTTCTACAAAGAAAACAGTTACACAACAACCATAAAACCAGTTACTTATAAATTGGATGGTGTTGTTTGTACAGAAATTGACCCTAAGTTGGACAATTATTATAAGAAAGACAATTCTTATTTCACAGAGCAACCAATTGATCTTGTACCAAACCAACCATATCCAAACGCAAGCTTCGATAATTTTAAGTTTGTATGTGATAATATCAAATTTGCTGATGATTTAAACCAGTTAACTGGTTATAAGAAACCTGCTTCAAGAGAGCTTAAAGTTACATTTTTCCCTGACTTAAATGGTGATGTGGTGGCTATTGATTATAAACACTACACACCCTCTTTTAAGAAAGGAGCTAAATTGTTACATAAACCTATTGTTTGGCATGTTAACAATGCAACTAATAAAGCCACGTATAAACCAAATACCTGGTGTATACGTTGTCTTTGGAGCACAAAACCAGTTGAAACATCAAATTCGTTTGATGTACTGAAGTCAGAGGACGCGCAGGGAATGGATAATCTTGCCTGCGAAGATCTAAAACCAGTCTCTGAAGAAGTAGTGGAAAATCCTACCATACAGAAAGACGTTCTTGAGTGTAATGTGAAAACTACCGAAGTTGTAGGAGACATTATACTTAAACCAGCAAATAATAGTTTAAAAATTACAGAAGAGGTTGGCCACACAGATCTAATGGCTGCTTATGTAGACAATTCTAGTCTTACTATTAAGAAACCTAATGAATTATCTAGAGTATTAGGTTTGAAAACCCTTGCTACTCATGGTTTAGCTGCTGTTAATAGTGTCCCTTGGGATACTATAGCTAATTATGCTAAGCCTTTTCTTAACAAAGTTGTTAGTACAACTACTAACATAGTTACACGGTGTTTAAACCGTGTTTGTACTAATTATATGCCTTATTTCTTTACTTTATTGCTACAATTGTGTACTTTTACTAGAAGTACAAATTCTAGAATTAAAGCATCTATGCCGACTACTATAGCAAAGAATACTGTTAAGAGTGTCGGTAAATTTTGTCTAGAGGCTTCATTTAATTATTTGAAGTCACCTAATTTTTCTAAACTGATAAATATTATAATTTGGTTTTTACTATTAAGTGTTTGCCTAGGTTCTTTAATCTACTCAACCGCTGCTTTAGGTGTTTTAATGTCTAATTTAGGCATGCCTTCTTACTGTACTGGTTACAGAGAAGGCTATTTGAACTCTACTAATGTCACTATTGCAACCTACT
GTACTGGTTCTATACCTTGTAGTGTTTGTCTTAGTGGTTTAGATTCTTTAGACACCTATCCTTCTTTAGAAACTATACAAATTACCATTTCATCTTTTAAATGGGATTTAACTGCTTTTGGCTTAGTTGCAGAGTGGTTTTTGGCATATATTCTTTTCACTAGGTTTTTCTATGTACTTGGATTGGCTGCAATCATGCAATTGTTTTTCAGCTATTTTGCAGTACATTTTATTAGTAATTCTTGGCTTATGTGGTTAATAATTAATCTTGTACAAATGGCCCCGATTTCAGCTATGGTTAGAATGTACATCTTCTTTGCATCATTTTATTATGTATGGAAAAGTTATGTGCATGTTGTAGACGGTTGTAATTCATCAACTTGTATGATGTGTTACAAACGTAATAGAGCAACAAGAGTCGAATGTACAACTATTGTTAATGGTGTTAGAAGGTCCTTTTATGTCTATGCTAATGGAGGTAAAGGCTTTTGCAAACTACACAATTGGAATTGTGTTAATTGTGATACATTCTGTGCTGGTAGTACATTTATTAGTGATGAAGTTGCGAGAGACTTGTCACTACAGTTTAAAAGACCAATAAATCCTACTGACCAGTCTTCTTACATCGTTGATAGTGTTACAGTGAAGAATGGTTCCATCCATCTTTACTTTGATAAAGCTGGTCAAAAGACTTATGAAAGACATTCTCTCTCTCATTTTGTTAACTTAGACAACCTGAGAGCTAATAACACTAAAGGTTCATTGCCTATTAATGTTATAGTTTTTGATGGTAAATCAAAATGTGAAGAATCATCTGCAAAATCAGCGTCTGTTTACTACAGTCAGCTTATGTGTCAACCTATACTGTTACTAGATCAGGCATTAGTGTCTGATGTTGGTGATAGTGCGGAAGTTGCAGTTAAAATGTTTGATGCTTACGTTAATACGTTTTCATCAACTTTTAACGTACCAATGGAAAAACTCAAAACACTAGTTGCAACTGCAGAAGCTGAACTTGCAAAGAATGTGTCCTTAGACAATGTCTTATCTACTTTTATTTCAGCAGCTCGGCAAGGGTTTGTTGATTCAGATGTAGAAACTAAAGATGTTGTTGAATGTCTTAAATTGTCACATCAATCTGACATAGAAGTTACTGGCGATAGTTGTAATAACTATATGCTCACCTATAACAAAGTTGAAAACATGACACCCCGTGACCTTGGTGCTTGTATTGACTGTAGTGCGCGTCATATTAATGCGCAGGTAGCAAAAAGTCACAACATTGCTTTGATATGGAACGTTAAAGATTTCATGTCATTGTCTGAACAACTACGAAAACAAATACGTAGTGCTGCTAAAAAGAATAACTTACCTTTTAAGTTGACATGTGCAACTACTAGACAAGTTGTTAATGTTGTAACAACAAAGATAGCACTTAAGGGTGGTAAAATTGTTAATAATTGGTTGAAGCAGTTAATTAAAGTTACACTTGTGTTCCTTTTTGTTGCTGCTATTTTCTATTTAATAACACCTGTTCATGTCATGTCTAAACATACTGACTTTTCAAGTGAAATCATAGGATACAAGGCTATTGATGGTGGTGTCACTCGTGACATAGCATCTACAGATACTTGTTTTGCTAACAAACATGCTGATTTTGACACATGGTTTAGTCAGCGTGGTGGTAGTTATACTAATGACAAAGCTTGCCCATTGATTGCTGCAGTCATAACAAGAGAAGTGGGTTTTGTCGTGCCTGGTTTGCCTGGCACGATATTACGCACAACTAATGGTGACTTTTTGCATTTCTTACCTAGAGTTTTTAGTGCAGTTGGTAACATCTGTTACACACCATCAAAACTTATAGAGTACACTGACTTTGCAACATCAGCTTGTGTTTTGGCTGCTGAATGTACAATTTTTAAAGATGCTTCTGGTAAGCCAGTACCATATTGTTATGATACCAATGTACTAGAAGGTTCTGTTGCTTATGAAAGTTTACGCCCTGACACACGTTATGTGCTCATGGATGGCTCTATTATTCAATTTCCTAACACCTACCTTGAAGGTTCTGTTAGAGTGGTAACAACTTTTGATTCTGAGTACTGTAGGCACGGCACTTGTGAAAGATCAGAAGCTGGTGTTTGTGTATCTACTAGTGGTAGATGGGTACTTAACAATGATTATTACAGATCTTTACCAGGAGTTTTCTGTGGTGTAGATGCTGTAAATTTACTTACTAATATGTTTACACCACTAATTCAACCTATTGGTGCTTTGGACATATCAGCATCTATAGTAGCTGGTGGTATTGTAGCTATCGTAGTAACATGCCTTGCCTACTATTTTATGAGGTTTAGAAGAGCTTTTGGTGAATACAGTCATGTAGTTGCCTTTAATACTTTACTATTCCTTATGTCATTCACTGTACTCTGTTTAACACCAGTTTACTCATTCTTACCTGGTGTTTATTCTGTTATTTACTTGTACTTGACATTTTATCTTACTAATGATGTTTCTTTTTTAGCACATATTCAGTGGATGGTTATGTTCACACCTTTAGTACCTTTCTGGATAACAATTGCTTATATCATTTGTATTTCCACAAAGCATTTCTATTGGTTCTTTAGTAATTACCTAAAGAGACGTGTAGTCTTTAATGGTGTTTCCTTTAGTACTTTTGAAGAAGCTGCGCTGTGCACCTTTTTGTTAAATAAAGAAATGTATCTAAAGTTGCGTAGTGATGTGCTATTACCTCTTACGCAATATAATAGATACTTAGCTCTTTATAATAAGTACAAGTATTTTAGTGGAGCAATGGATACAACTAGCTACAGAGAAGCTGCTTGTTGTCATCTCGCAAAGGCTCTCAATGACTTCAGTAACTCAGGTTCTGATGTTCTTTACCAACCACCACAAACCTCTATCACCTCAGCTGTTTTGCAGAGTGGTTTTAGAAAAATGGCATTCCCATCTGGTAAAGTTGAGGGTTGTATGGTACAAGTAACTTGTGGTACAACTACACTTAACGGTCTTTGGCTTGATGACGTAGTTTACTGTCCAAGACATGTGATCTGCACCTCTGAAGACATGCTTAACCCTAATTATGAAGATTTACTCATTCGTAAGTCTAATCATAATTTCTTGGTACAGGCTGGTAATGTTCAACTCAGGGTTATTGGACATTCTATGCAAAATTGTGTACTTAAGCTTAAGGTTGATACAGCCAATCCTAAGACACCTAAGTATAAGTTTGTTCGCATTCAACCAGGACAGACTTTTTCAGTGTTAGCTTGTTACAATGGTTCACCATCTGGTGTTTACCAATGTGCTATGAGGCCCAATTTCACTATTAAGGGTTCATTCCTTAATGGTTCATGTGGTAGTGTTGGTTTTAACATAGATTATGACTGTGTCTCTTTTTGTTACATGCACCATATGGAATTACCAACTGGAGTTCATGCTGGCACAGACTTAGAAGGTAACTTTTATGGACCTTTTGTTGACAGGCAAACAGCACAAGCAGCTGGTACGGACACAACTATTACAGTTAATG
TTTTAGCTTGGTTGTACGCTGCTGTTATAAATGGAGACAGGTGGTTTCTCAATCGATTTACCACAACTCTTAATGACTTTAACCTTGTGGCTATGAAGTACAATTATGAACCTCTAACACAAGACCATGTTGACATACTAGGACCTCTTTCTGCTCAAACTGGAATTGCCGTTTTAGATATGTGTGCTTCATTAAAAGAATTACTGCAAAATGGTATGAATGGACGTACCATATTGGGTAGTGCTTTATTAGAAGATGAATTTACACCTTTTGATGTTGTTAGACAATGCTCAGGTGTTACTTTCCAAAGTGCAGTGAAAAGAACAATCAAGGGTACACACCACTGGTTGTTACTCACAATTTTGACTTCACTTTTAGTTTTAGTCCAGAGTACTCAATGGTCTTTGTTCTTTTTTTTGTATGAAAATGCCTTTTTACCTTTTGCTATGGGTATTATTGCTATGTCTGCTTTTGCAATGATGTTTGTCAAACATAAGCATGCATTTCTCTGTTTGTTTTTGTTACCTTCTCTTGCCACTGTAGCTTATTTTAATATGGTCTATATGCCTGCTAGTTGGGTGATGCGTATTATGACATGGTTGGATATGGTTGATACTAGTTTGTCTGGTTTTAAGCTAAAAGACTGTGTTATGTATGCATCAGCTGTAGTGTTACTAATCCTTATGACAGCAAGAACTGTGTATGATGATGGTGCTAGGAGAGTGTGGACACTTATGAATGTCTTGACACTCGTTTATAAAGTTTATTATGGTAATGCTTTAGATCAAGCCATTTCCATGTGGGCTCTTATAATCTCTGTTACTTCTAACTACTCAGGTGTAGTTACAACTGTCATGTTTTTGGCCAGAGGTATTGTTTTTATGTGTGTTGAGTATTGCCCTATTTTCTTCATAACTGGTAATACACTTCAGTGTATAATGCTAGTTTATTGTTTCTTAGGCTATTTTTGTACTTGTTACTTTGGCCTCTTTTGTTTACTCAACCGCTACTTTAGACTGACTCTTGGTGTTTATGATTACTTAGTTTCTACACAGGAGTTTAGATATATGAATTCACAGGGACTACTCCCACCCAAGAATAGCATAGATGCCTTCAAACTCAACATTAAATTGTTGGGTGTTGGTGGCAAACCTTGTATCAAAGTAGCCACTGTACAGTCTAAAATGTCAGATGTAAAGTGCACATCAGTAGTCTTACTCTCAGTTTTGCAACAACTCAGAGTAGAATCATCATCTAAATTGTGGGCTCAATGTGTCCAGTTACACAATGACATTCTCTTAGCTAAAGATACTACTGAAGCCTTTGAAAAAATGGTTTCACTACTTTCTGTTTTGCTTTCCATGCAGGGTGCTGTAGACATAAACAAGCTTTGTGAAGAAATGCTGGACAACAGGGCAACCTTACAAGCTATAGCCTCAGAGTTTAGTTCCCTTCCATCATATGCAGCTTTTGCTACTGCTCAAGAAGCTTATGAGCAGGCTGTTGCTAATGGTGATTCTGAAGTTGTTCTTAAAAAGTTGAAGAAGTCTTTGAATGTGGCTAAATCTGAATTTGACCGTGATGCAGCCATGCAACGTAAGTTGGAAAAGATGGCTGATCAAGCTATGACCCAAATGTATAAACAGGCTAGATCTGAGGACAAGAGGGCAAAAGTTACTAGTGCTATGCAGACAATGCTTTTCACTATGCTTAGAAAGTTGGATAATGATGCACTCAACAACATTATCAACAATGCAAGAGATGGTTGTGTTCCCTTGAACATAATACCTCTTACAACAGCAGCCAAACTAATGGTTGTCATACCAGACTATAACACATATAAAAATACGTGTGATGGTACAACATTTACTTATGCATCAGCATTGTGGGAAATCCAACAGGTTGTAGATGCAGATAGTAAAATTGTTCAACTTAGTGAAATTAGTATGGACAATTCACCTAATTTAGCATGGCCTCTTATTGTAACAGCTTTAAGGGCCAATTCTGCTGTCAAATTACAGAATAATGAGCTTAGTCCTGTTGCACTACGACAGATGTCTTGTGCTGCCGGTACTACACAAACTGCTTGCACTGATGACAATGCGTTAGCTTACTACAACACAACAAAGGGAGGTAGGTTTGTACTTGCACTGTTATCCGATTTACAGGATTTGAAATGGGCTAGATTCCCTAAGAGTGATGGAACTGGTACTATCTATACAGAACTGGAACCACCTTGTAGGTTTGTTACAGACACACCTAAAGGTCCTAAAGTGAAGTATTTATACTTTATTAAAGGATTAAACAACCTAAATAGAGGTATGGTACTTGGTAGTTTAGCTGCCACAGTACGTCTACAAGCTGGTAATGCAACAGAAGTGCCTGCCAATTCAACTGTATTATCTTTCTGTGCTTTTGCTGTAGATGCTGCTAAAGCTTACAAAGATTATCTAGCTAGTGGGGGACAACCAATCACTAATTGTGTTAAGATGTTGTGTACACACACTGGTACTGGTCAGGCAATAACAGTTACACCGGAAGCCAATATGGATCAAGAATCCTTTGGTGGTGCATCGTGTTGTCTGTACTGCCGTTGCCACATAGATCATCCAAATCCTAAAGGATTTTGTGACTTAAAAGGTAAGTATGTACAAATACCTACAACTTGTGCTAATGACCCTGTGGGTTTTACACTTAAAAACACAGTCTGTACCGTCTGCGGTATGTGGAAAGGTTATGGCTGTAGTTGTGATCAACTCCGCGAACCCATGCTTCAGTCAGCTGATGCACAATCGTTTTTAAACGGGTTTGCGGTGTAAGTGCAGCCCGTCTTACACCGTGCGGCACAGGCACTAGTACTGATGTCGTATACAGGGCTTTTGACATCTACAATGATAAAGTAGCTGGTTTTGCTAAATTCCTAAAAACTAATTGTTGTCGCTTCCAAGAAAAGGACGAAGATGACAATTTAATTGATTCTTACTTTGTAGTTAAGAGACACACTTTCTCTAACTACCAACATGAAGAAACAATTTATAATTTACTTAAGGATTGTCCAGCTGTTGCTAAACATGACTTCTTTAAGTTTAGAATAGACGGTGACATGGTACCACATATATCACGTCAACGTCTTACTAAATACACAATGGCAGACCTCGTCTATGCTTTAAGGCATTTTGATGAAGGTAATTGTGACACATTAAAAGAAATACTTGTCACATACAATTGTTGTGATGATGATTATTTCAATAAAAAGGACTGGTATGATTTTGTAGAAAACCCAGATATATTACGCGTATACGCCAACTTAGGTGAACGTGTACGCCAAGCTTTGTTAAAAACAGTACAATTCTGTGATGCCATGCGAAATGCTGGTATTGTTGGTGTACTGACATTAGATAATCAAGATCTCAATGGTAACTGGTATGATTTCGGTGATTTCATACAAACCACGCCAGGTAGTGGAGTTCCTGTTGTAGATTCTTATTATTCATTGTTAATGCCTATATTAACCTTGACCAGGGCTTTAACTGCAGAGTCACATGTTGACACTGA
CTTAACAAAGCCTTACATTAAGTGGGATTTGTTAAAATATGACTTCACGGAAGAGAGGTTAAAACTCTTTGACCGTTATTTTAAATATTGGGATCAGACATACCACCCAAATTGTGTTAACTGTTTGGATGACAGATGCATTCTGCATTGTGCAAACTTTAATGTTTTATTCTCTACAGTGTTCCCACCTACAAGTTTTGGACCACTAGTGAGAAAAATATTTGTTGATGGTGTTCCATTTGTAGTTTCAACTGGATACCACTTCAGAGAGCTAGGTGTTGTACATAATCAGGATGTAAACTTACATAGCTCTAGACTTAGTTTTAAGGAATTACTTGTGTATGCTGCTGACCCTGCTATGCACGCTGCTTCTGGTAATCTATTACTAGATAAACGCACTACGTGCTTTTCAGTAGCTGCACTTACTAACAATGTTGCTTTTCAAACTGTCAAACCCGGTAATTTTAACAAAGACTTCTATGACTTTGCTGTGTCTAAGGGTTTCTTTAAGGAAGGAAGTTCTGTTGAATTAAAACACTTCTTCTTTGCTCAGGATGGTAATGCTGCTATCAGCGATTATGACTACTATCGTTATAATCTACCAACAATGTGTGATATCAGACAACTACTATTTGTAGTTGAAGTTGTTGATAAGTACTTTGATTGTTACGATGGTGGCTGTATTAATGCTAACCAAGTCATCGTCAACAACCTAGACAAATCAGCTGGTTTTCCATTTAATAAATGGGGTAAGGCTAGACTTTATTATGATTCAATGAGTTATGAGGATCAAGATGCACTTTTCGCATATACAAAACGTAATGTCATCCCTACTATAACTCAAATGAATCTTAAGTATGCCATTAGTGCAAAGAATAGAGCTCGCACCGTAGCTGGTGTCTCTATCTGTAGTACTATGACCAATAGACAGTTTCATCAAAAATTATTGAAATCAATAGCCGCCACTAGAGGAGCTACTGTAGTAATTGGAACAAGCAAATTCTATGGTGGTTGGCACAACATGTTAAAAACTGTTTATAGTGATGTAGAAAACCCTCACCTTATGGGTTGGGATTATCCTAAATGTGATAGAGCCATGCCTAACATGCTTAGAATTATGGCCTCACTTGTTCTTGCTCGCAAACATACAACGTGTTGTAGCTTGTCACACCGTTTCTATAGATTAGCTAATGAGTGTGCTCAAGTATTGAGTGAAATGGTCATGTGTGGCGGTTCACTATATGTTAAACCAGGTGGAACCTCATCAGGAGATGCCACAACTGCTTATGCTAATAGTGTTTTTAACATTTGTCAAGCTGTCACGGCCAATGTTAATGCACTTTTATCTACTGATGGTAACAAAATTGCCGATAAGTATGTCCGCAATTTACAACACAGACTTTATGAGTGTCTCTATAGAAATAGAGATGTTGACACAGACTTTGTGAATGAGTTTTACGCATATTTGCGTAAACATTTCTCAATGATGATACTCTCTGACGATGCTGTTGTGTGTTTCAATAGCACTTATGCATCTCAAGGTCTAGTGGCTAGCATAAAGAACTTTAAGTCAGTTCTTTATTATCAAAACAATGTTTTTATGTCTGAAGCAAAATGTTGGACTGAGACTGACCTTACTAAAGGACCTCATGAATTTTGCTCTCAACATACAATGCTAGTTAAACAGGGTGATGATTATGTGTACCTTCCTTACCCAGATCCATCAAGAATCCTAGGGGCCGGCTGTTTTGTAGATGATATCGTAAAAACAGATGGTACACTTATGATTGAACGGTTCGTGTCTTTAGCTATAGATGCTTACCCACTTACTAAACATCCTAATCAGGAGTATGCTGATGTCTTTCATTTGTACTTACAATACATAAGAAAGCTACATGATGAGTTAACAGGACACATGTTAGACATGTATTCTGTTATGCTTACTAATGATAACACTTCAAGGTATTGGGAACCTGAGTTTTATGAGGCTATGTACACACCGCATACAGTCTTACAGGCTGTTGGGGCTTGTGTTCTTTGCAATTCACAGACTTCATTAAGATGTGGTGCTTGCATACGTAGACCATTCTTATGTTGTAAATGCTGTTACGACCATGTCATATCAACATCACATAAATTAGTCTTGTCTGTTAATCCGTATGTTTGCAATGCTCCAGGTTGTGATGTCACAGATGTGACTCAACTTTACTTAGGAGGTATGAGCTATTATTGTAAATCACATAAACCACCCATTAGTTTTCCATTGTGTGCTAATGGACAAGTTTTTGGTTTATATAAAAATACATGTGTTGGTAGCGATAATGTTACTGACTTTAATGCAATTGCAACATGTGACTGGACAAATGCTGGTGATTACATTTTAGCTAACACCTGTACTGAAAGACTCAAGCTTTTTGCAGCAGAAACGCTCAAAGCTACTGAGGAGACATTTAAACTGTCTTATGGTATTGCTACTGTACGTGAAGTGCTGTCTGACAGAGAATTACATCTTTCATGGGAAGTTGGTAAACCTAGACCACCACTTAACCGAAATTATGTCTTTACTGGTTATCGTGTAACTAAAAACAGTAAAGTACAAATAGGAGAGTACACCTTTGAAAAAGGTGACTATGGTGATGCTGTTGTTTACCGAGGTACAACAACTTACAAATTAAATGTTGGTGATTATTTTGTGCTGACATCACATACAGTAATGCCATTAAGTGCACCTACACTAGTGCCACAAGAGCACTATGTTAGAATTACTGGCTTATACCCAACACTCAATATCTCAGATGAGTTTTCTAGCAATGTTGCAAATTATCAAAAGGTTGGTATGCAAAAGTATTCTACACTCCAGGGACCACCTGGTACTGGTAAGAGTCATTTTGCTATTGGCCTAGCTCTCTACTACCCTTCTGCTCGCATAGTGTATACAGCTTGCTCTCATGCCGCTGTTGATGCACTATGTGAGAAGGCATTAAAATATTTGCCTATAGATAAATGTAGTAGAATTATACCTGCACGTGCTCGTGTAGAGTGTTTTGATAAATTCAAAGTGAATTCAACATTAGAACAGTATGTCTTTTGTACTGTAAATGCATTGCCTGAGACGACAGCAGATATAGTTGTCTTTGATGAAATTTCAATGGCCACAAATTATGATTTGAGTGTTGTCAATGCCAGATTACGTGCTAAGCACTATGTGTACATTGGCGACCCTGCTCAATTACCTGCACCACGCACATTGCTAACTAAGGGCACACTAGAACCAGAATATTTCAATTCAGTGTGTAGACTTATGAAAACTATAGGTCCAGACATGTTCCTCGGAACTTGTCGGCGTTGTCCTGCTGAAATTGTTGACACTGTGAGTGCTTTGGTTTATGATAATAAGCTTAAAGCACATAAAGACAAATCAGCTCAATGCTTTAAAATGTTTTATAAGGGTGTTATCACGCATGATGTTTCATCTGCAATTAACAGGCCACAAATAGGCGTGGTAAGAGAATTCCTTACACGTAACCCTGCTTGGAGAAAAGCTGTCTTTATTTC
ACCTTATAATTCACAGAATGCTGTAGCCTCAAAGATTTTGGGACTACCAACTCAAACTGTTGATTCATCACAGGGCTCAGAATATGACTATGTCATATTCACTCAAACCACTGAAACAGCTCACTCTTGTAATGTAAACAGATTTAATGTTGCTATTACCAGAGCAAAAGTAGGCATACTTTGCATAATGTCTGATAGAGACCTTTATGACAAGTTGCAATTTACAAGTCTTGAAATTCCACGTAGGAATGTGGCAACTTTACAAGCTGAAAATGTAACAGGACTCTTTAAAGATTGTAGTAAGGTAATCACTGGGTTACATCCTACACAGGCACCTACACACCTCAGTGTTGACACTAAATTCAAAACTGAAGGTTTATGTGTTGACATACCTGGCATACCTAAGGACATGACCTATAGAAGACTCATCTCTATGATGGGTTTTAAAATGAATTATCAAGTTAATGGTTACCCTAACATGTTTATCACCCGCGAAGAAGCTATAAGACATGTACGTGCATGGATTGGCTTCGATGTCGAGGGGTGTCATGCTACTAGAGAAGCTGTTGGTACCAATTTACCTTTACAGCTAGGTTTTTCTACAGGTGTTAACCTAGTTGCTGTACCTACAGGTTATGTTGATACACCTAATAATACAGATTTTTCCAGAGTTAGTGCTAAACCACCGCCTGGAGATCAATTTAAACACCTCATACCACTTATGTACAAAGGACTTCCTTGGAATGTAGTGCGTATAAAGATTGTACAAATGTTAAGTGACACACTTAAAAATCTCTCTGACAGAGTCGTATTTGTCTTATGGGCACATGGCTTTGAGTTGACATCTATGAAGTATTTTGTGAAAATAGGACCTGAGCGCACCTGTTGTCTATGTGATAGACGTGCCACATGCTTTTCCACTGCTTCAGACACTTATGCCTGTTGGCATCATTCTATTGGATTTGATTACGTCTATAATCCGTTTATGATTGATGTTCAACAATGGGGTTTTACAGGTAACCTACAAAGCAACCATGATCTGTATTGTCAAGTCCATGGTAATGCACATGTAGCTAGTTGTGATGCAATCATGACTAGGTGTCTAGCTGTCCACGAGTGCTTTGTTAAGCGTGTTGACTGGACTATTGAATATCCTATAATTGGTGATGAACTGAAGATTAATGCGGCTTGTAGAAAGGTTCAACACATGGTTGTTAAAGCTGCATTATTAGCAGACAAATTCCCAGTTCTTCACGACATTGGTAACCCTAAAGCTATTAAGTGTGTACCTCAAGCTGATGTAGAATGGAAGTTCTATGATGCACAGCCTTGTAGTGACAAAGCTTATAAAATAGAAGAATTATTCTATTCTTATGCCACACATTCTGACAAATTCACAGATGGTGTATGCCTATTTTGGAATTGCAATGTCGATAGATATCCTGCTAATTCCATTGTTTGTAGATTTGACACTAGAGTGCTATCTAACCTTAACTTGCCTGGTTGTGATGGTGGCAGTTTGTATGTAAATAAACATGCATTCCACACACCAGCTTTTGATAAAAGTGCTTTTGTTAATTTAAAACAATTACCATTTTTCTATTACTCTGACAGTCCATGTGAGTCTCATGGAAAACAAGTAGTGTCAGATATAGATTATGTACCACTAAAGTCTGCTACGTGTATAACACGTTGCAATTTAGGTGGTGCTGTCTGTAGACATCATGCTAATGAGTACAGATTGTATCTCGATGCTTATAACATGATGATCTCAGCTGGCTTTAGCTTGTGGGTTTACAAACAATTTGATACTTATAACCTCTGGAACACTTTTACAAGACTTCAGAGTTTAGAAAATGTGGCTTTTAATGTTGTAAATAAGGGACACTTTGATGGACAACAGGGTGAAGTACCAGTTTCTATCATTAATAACACTGTTTACACAAAAGTTGATGGTGTTGATGTAGAATTGTTTGAAAATAAAACAACATTACCTGTTAATGTAGCATTTGAGCTTTGGGCTAAGCGCAACATTAAACCAGTACCAGAGGTGAAAATACTCAATAATTTGGGTGTGGACATTGCTGCTAATACTGTGATCTGGGACTACAAAAGAGATGCTCCAGCACATATATCTACTATTGGTGTTTGTTCTATGACTGACATAGCCAAGAAACCAACTGAAACGATTTGTGCACCACTCACTGTCTTTTTTGATGGTAGAGTTGATGGTCAAGTAGACTTATTTAGAAATGCCCGTAATGGTGTTCTTATTACAGAAGGTAGTGTTAAAGGTTTACAACCATCTGTAGGTCCCAAACAAGCTAGTCTTAATGGAGTCACATTAATTGGAGAAGCCGTAAAAACACAGTTCAATTATTATAAGAAAGTTGATGGTGTTGTCCAACAATTACCTGAAACTTACTTTACTCAGAGTAGAAATTTACAAGAATTTAAACCCAGGAGTCAAATGGAAATTGATTTCTTAGAATTAGCTATGGATGAATTCATTGAACGGTATAAATTAGAAGGCTATGCCTTCGAACATATCGTTTATGGAGATTTTAGTCATAGTCAGTTAGGTGGTTTACATCTACTGATTGGACTAGCTAAACGTTTTAAGGAATCACCTTTTGAATTAGAAGATTTTATTCCTATGGACAGTACAGTTAAAAACTATTTCATAACAGATGCGCAAACAGGTTCATCTAAGTGTGTGTGTTCTGTTATTGATTTATTACTTGATGATTTTGTTGAAATAATAAAATCCCAAGATTTATCTGTAGTTTCTAAGGTTGTCAAAGTGACTATTGACTATACAGAAATTTCATTTATGCTTTGGTGTAAAGATGGCCATGTAGAAACATTTTACCCAAAATTACAATCTAGTCAAGCGTGGCAACCGGGTGTTGCTATGCCTAATCTTTACAAAATGCAAAGAATGCTATTAGAAAAGTGTGACCTTCAAAATTATGGTGATAGTGCAACATTACCTAAAGGCATAATGATGAATGTCGCAAAATATACTCAACTGTGTCAATATTTAAACACATTAACATTAGCTGTACCCTATAATATGAGAGTTATACATTTTGGTGCTGGTTCTGATAAAGGAGTTGCACCAGGTACAGCTGTTTTAAGACAGTGGTTGCCTACGGGTACGCTGCTTGTCGATTCAGATCTTAATGACTTTGTCTCTGATGCAGATTCAACTTTGATTGGTGATTGTGCAACTGTACATACAGCTAATAAATGGGATCTCATTATTAGTGATATGTACGACCCTAAGACTAAAAATGTTACAAAAGAAAATGACTCTAAAGAGGGTTTTTTCACTTACATTTGTGGGTTTATACAACAAAAGCTAGCTCTTGGAGGTTCCGTGGCTATAAAGATAACAGAACATTCTTGGAATGCTGATCTTTATAAGCTCATGGGACACTTCGCATGGTGGACAGCCTTTGTTACTAATGTGAATGCGTCATCATCTGAAGCATTTTTAATTGGATGTAATTATCTTGGCAAACCACGCGAACAAATAGATGGTTATGTCAT
GCATGCAAATTACATATTTTGGAGGAATACAAATCCAATTCAGTTGTCTTCCTATTCTTTATTTGACATGAGTAAATTTCCCCTTAAATTAAGGGGTACTGCTGTTATGTCTTTAAAAGAAGGTCAAATCAATGATATGATTTTATCTCTTCTTAGTAAAGGTAGACTTATAATTAGAGAAAACAACAGAGTTGTTATTTCTAGTGATGTTCTTGTTAACAACTAAACGAACAATGTTTGTTTTTCTTGTTTTATTGCCACTAGTCTCTAGTCAGTGTGTTAATCTTACAACCAGAACTCAATTACCCCCTGCATACACTAATTCTTTCACACGTGGTGTTTATTACCCTGACAAAGTTTTCAGATCCTCAGTTTTACATTCAACTCAGGACTTGTTCTTACCTTTCTTTTCCAATGTTACTTGGTTCCATGCTATACATGTCTCTGGGACCAATGGTACTAAGAGGTTTGATAACCCTGTCCTACCATTTAATGATGGTGTTTATTTTGCTTCCACTGAGAAGTCTAACATAATAAGAGGCTGGATTTTTGGTACTACTTTAGATTCGAAGACCCAGTCCCTACTTATTGTTAATAACGCTACTAATGTTGTTATTAAAGTCTGTGAATTTCAATTTTGTAATGATCCATTTTTGGGTGTTTATTACCACAAAAACAACAAAAGTTGGATGGAAAGTGAGTTCAGAGTTTATTCTAGTGCGAATAATTGCACTTTTGAATATGTCTCTCAGCCTTTTCTTATGGACCTTGAAGGAAAACAGGGTAATTTCAAAAATCTTAGGGAATTTGTGTTTAAGAATATTGATGGTTATTTTAAAATATATTCTAAGCACACGCCTATTAATTTAGTGCGTGATCTCCCTCAGGGTTTTTCGGCTTTAGAACCATTGGTAGATTTGCCAATAGGTATTAACATCACTAGGTTTCAAACTTTACTTGCTTTACATAGAAGTTATTTGACTCCTGGTGATTCTTCTTCAGGTTGGACAGCTGGTGCTGCAGCTTATTATGTGGGTTATCTTCAACCTAGGACTTTTCTATTAAAATATAATGAAAATGGAACCATTACAGATGCTGTAGACTGTGCACTTGACCCTCTCTCAGAAACAAAGTGTACGTTGAAATCCTTCACTGTAGAAAAAGGAATCTATCAAACTTCTAACTTTAGAGTCCAACCAACAGAATCTATTGTTAGATTTCCTAATATTACAAACTTGTGCCCTTTTGGTGAAGTTTTTAACGCCACCAGATTTGCATCTGTTTATGCTTGGAACAGGAAGAGAATCAGCAACTGTGTTGCTGATTATTCTGTCCTATATAATTCCGCATCATTTTCCACTTTTAAGTGTTATGGAGTGTCTCCTACTAAATTAAATGATCTCTGCTTTACTAATGTCTATGCAGATTCATTTGTAATTAGAGGTGATGAAGTCAGACAAATCGCTCCAGGGCAAACTGGAAAGATTGCTGATTATAATTATAAATTACCAGATGATTTTACAGGCTGCGTTATAGCTTGGAATTCTAACAATCTTGATTCTAAGGTTGGTGGTAATTATAATTACCTGTATAGATTGTTTAGGAAGTCTAATCTCAAACCTTTTGAGAGAGATATTTCAACTGAAATCTATCAGGCCGGTAGCACACCTTGTAATGGTGTTGAAGGTTTTAATTGTTACTTTCCTTTACAATCATATGGTTTCCAACCCACTAATGGTGTTGGTTACCAACCATACAGAGTAGTAGTACTTTCTTTTGAACTTCTACATGCACCAGCAACTGTTTGTGGACCTAAAAAGTCTACTAATTTGGTTAAAAACAAATGTGTCAATTTCAACTTCAATGGTTTAACAGGCACAGGTGTTCTTACTGAGTCTAACAAAAAGTTTCTGCCTTTCCAACAATTTGGCAGAGACATTGCTGACACTACTGATGCTGTCCGTGATCCACAGACACTTGAGATTCTTGACATTACACCATGTTCTTTTGGTGGTGTCAGTGTTATAACACCAGGAACAAATACTTCTAACCAGGTTGCTGTTCTTTATCAGGATGTTAACTGCACAGAAGTCCCTGTTGCTATTCATGCAGATCAACTTACTCCTACTTGGCGTGTTTATTCTACAGGTTCTAATGTTTTTCAAACACGTGCAGGCTGTTTAATAGGGGCTGAACATGTCAACAACTCATATGAGTGTGACATACCCATTGGTGCAGGTATATGCGCTAGTTATCAGACTCAGACTAATTCTCCTCGGCGGGCACGTAGTGTAGCTAGTCAATCCATCATTGCCTACACTATGTCACTTGGTGCAGAAAATTCAGTTGCTTACTCTAATAACTCTATTGCCATACCCACAAATTTTACTATTAGTGTTACCACAGAAATTCTACCAGTGTCTATGACCAAGACATCAGTAGATTGTACAATGTACATTTGTGGTGATTCAACTGAATGCAGCAATCTTTTGTTGCAATATGGCAGTTTTTGTACACAATTAAACCGTGCTTTAACTGGAATAGCTGTTGAACAAGACAAAAACACCCAAGAAGTTTTTGCACAAGTCAAACAAATTTACAAAACACCACCAATTAAAGATTTTGGTGGTTTTAATTTTTCACAAATATTACCAGATCCATCAAAACCAAGCAAGAGGTCATTTATTGAAGATCTACTTTTCAACAAAGTGACACTTGCAGATGCTGGCTTCATCAAACAATATGGTGATTGCCTTGGTGATATTGCTGCTAGAGACCTCATTTGTGCACAAAAGTTTAACGGCCTTACTGTTTTGCCACCTTTGCTCACAGATGAAATGATTGCTCAATACACTTCTGCACTGTTAGCGGGTACAATCACTTCTGGTTGGACCTTTGGTGCAGGTGCTGCATTACAAATACCATTTGCTATGCAAATGGCTTATAGGTTTAATGGTATTGGAGTTACACAGAATGTTCTCTATGAGAACCAAAAATTGATTGCCAACCAATTTAATAGTGCTATTGGCAAAATTCAAGACTCACTTTCTTCCACAGCAAGTGCACTTGGAAAACTTCAAGATGTGGTCAACCAAAATGCACAAGCTTTAAACACGCTTGTTAAACAACTTAGCTCCAATTTTGGTGCAATTTCAAGTGTTTTAAATGATATCCTTTCACGTCTTGACAAAGTTGAGGCTGAAGTGCAAATTGATAGGTTGATCACAGGCAGACTTCAAAGTTTGCAGACATATGTGACTCAACAATTAATTAGAGCTGCAGAAATCAGAGCTTCTGCTAATCTTGCTGCTACTAAAATGTCAGAGTGTGTACTTGGACAATCAAAAAGAGTTGATTTTTGTGGAAAGGGCTATCATCTTATGTCCTTCCCTCAGTCAGCACCTCATGGTGTAGTCTTCTTGCATGTGACTTATGTCCCTGCACAAGAAAAGAACTTCACAACTGCTCCTGCCATTTGTCATGATGGAAAAGCACACTTTCCTCGTGAAGGTGTCTTTGTTTCAAATGGCACACACTGGTTTGTAACACAAAGGA
ATTTTTATGAACCACAAATCATTACTACAGACAACACATTTGTGTCTGGTAACTGTGATGTTGTAATAGGAATTGTCAACAACACAGTTTATGATCCTTTGCAACCTGAATTAGACTCATTCAAGGAGGAGTTAGATAAATATTTTAAGAATCATACATCACCAGATGTTGATTTAGGTGACATCTCTGGCATTAATGCTTCAGTTGTAAACATTCAAAAAGAAATTGACCGCCTCAATGAGGTTGCCAAGAATTTAAATGAATCTCTCATCGATCTCCAAGAACTTGGAAAGTATGAGCAGTATATAAAATGGCCATGGTACATTTGGCTAGGTTTTATAGCTGGCTTGATTGCCATAGTAATGGTGACAATTATGCTTTGCTGTATGACCAGTTGCTGTAGTTGTCTCAAGGGCTGTTGTTCTTGTGGATCCTGCTGCAAATTTGATGAAGACGACTCTGAGCCAGTGCTCAAAGGAGTCAAATTACATTACACATAAACGAACTTATGGATTTGTTTATGAGAATCTTCACAATTGGAACTGTAACTTTGAAGCAAGGTGAAATCAAGGATGCTACTCCTTCAGATTTTGTTCGCGCTACTGCAACGATACCGATACAAGCCTCACTCCCTTTCGGATGGCTTATTGTTGGCGTTGCACTTCTTGCTGTTTTTCAGAGCGCTTCCAAAATCATAACCCTCAAAAAGAGATGGCAACTAGCACTCTCCAAGGGTGTTCACTTTGTTTGCAACTTGCTGTTGTTGTTTGTAACAGTTTACTCACACCTTTTGCTCGTTGCTGCTGGCCTTGAAGCCCCTTTTCTCTATCTTTATGCTTTAGTCTACTTCTTGCAGAGTATAAACTTTGTAAGAATAATAATGAGGCTTTGGCTTTGCTGGAAATGCCGTTCCAAAAACCCATTACTTTATGATGCCAACTATTTTCTTTGCTGGCATACTAATTGTTACGACTATTGTATACCTTACAATAGTGTAACTTCTTCAATTGTCATTACTTCAGGTGATGGCACAACAAGTCCTATTTCTGAACATGACTACCAGATTGGTGGTTATACTGAAAAATGGGAATCTGGAGTAAAAGACTGTGTTGTATTACACAGTTACTTCACTTCAGACTATTACCAGCTGTACTCAACTCAATTGAGTACAGACACTGGTGTTGAACATGTTACCTTCTTCATCTACAATAAAATTGTTGATGAGCCTGAAGAACATGTCCAAATTCACACAATCGACGGTTCATCCGGAGTTGTTAATCCAGTAATGGAACCAATTTATGATGAACCGACGACGACTACTAGCGTGCCTTTGTAAGCACAAGCTGATGAGTACGAACTTATGTACTCATTCGTTTCGGAAGAGACAGGTACGTTAATAGTTAATAGCGTACTTCTTTTTCTTGCTTTCGTGGTATTCTTGCTAGTTACACTAGCCATCCTTACTGCGCTTCGATTGTGTGCGTACTGCTGCAATATTGTTAACGTGAGTCTTGTAAAACCTTCTTTTTACGTTTACTCTCGTGTTAAAAATCTGAATTCTTCTAGAGTTCCTGATCTTCTGGTCTAAACGAACTAAATATTATATTAGTTTTTCTGTTTGGAACTTTAATTTTAGCCATGGCAGATTCCAACGGTACTATTACCGTTGAAGAGCTTAAAAAGCTCCTTGAACAATGGAACCTAGTAATAGGTTTCCTATTCCTTACATGGATTTGTCTTCTACAATTTGCCTATGCCAACAGGAATAGGTTTTTGTATATAATTAAGTTAATTTTCCTCTGGCTGTTATGGCCAGTAACTTTAGCTTGTTTTGTGCTTGCTGCTGTTTACAGAATAAATTGGATCACCGGTGGAATTGCTATCGCAATGGCTTGTCTTGTAGGCTTGATGTGGCTCAGCTACTTCATTGCTTCTTTCAGACTGTTTGCGCGTACGCGTTCCATGTGGTCATTCAATCCAGAAACTAACATTCTTCTCAACGTGCCACTCCATGGCACTATTCTGACCAGACCGCTTCTAGAAAGTGAACTCGTAATCGGAGCTGTGATCCTTCGTGGACATCTTCGTATTGCTGGACACCATCTAGGACGCTGTGACATCAAGGACCTGCCTAAAGAAATCACTGTTGCTACATCACGAACGCTTTCTTATTACAAATTGGGAGCTTCGCAGCGTGTAGCAGGTGACTCAGGTTTTGCTGCATACAGTCGCTACAGGATTGGCAACTATAAATTAAACACAGACCATTCCAGTAGCAGTGACAATATTGCTTTGCTTGTACAGTAAGTGACAACAGATGTTTCATCTCGTTGACTTTCAGGTTACTATAGCAGAGATATTACTAATTATTATGAGGACTTTTAAAGTTTCCATTTGGAATCTTGATTACATCATAAACCTCATAATTAAAAATTTATCTAAGTCACTAACTGAGAATAAATATTCTCAATTAGATGAAGAGCAACCAATGGAGATTGATTAAACGAACATGAAAATTATTCTTTTCTTGGCACTGATAACACTCGCTACTTGTGAGCTTTATCACTACCAAGAGTGTGTTAGAGGTACAACAGTACTTTTAAAAGAACCTTGCTCTTCTGGAACATACGAGGGCAATTCACCATTTCATCCTCTAGCTGATAACAAATTTGCACTGACTTGCTTTAGCACTCAATTTGCTTTTGCTTGTCCTGACGGCGTAAAACACGTCTATCAGTTACGTGCCAGATCAGTTTCACCTAAACTGTTCATCAGACAAGAGGAAGTTCAAGAACTTTACTCTCCAATTTTTCTTATTGTTGCGGCAATAGTGTTTATAACACTTTGCTTCACACTCAAAAGAAAGACAGAATGATTGAACTTTCATTAATTGACTTCTATTTGTGCTTTTTAGCCTTTCTGCTATTCCTTGTTTTAATTATGCTTATTATCTTTTGGTTCTCACTTGAACTGCAAGATCATAATGAAACTTGTCACGCCTAAACGAACATGAAATTTCTTGTTTTCTTAGGAATCATCACAACTGTAGCTGCATTTCACCAAGAATGTAGTTTACAGTCATGTACTCAACATCAACCATATGTAGTTGATGACCCGTGTCCTATTCACTTCTATTCTAAATGGTATATTAGAGTAGGAGCTAGAAAATCAGCACCTTTAATTGAATTGTGCGTGGATGAGGCTGGTTCTAAATCACCCATTCAGTACATCGATATCGGTAATTATACAGTTTCCTGTTCACCTTTTACAATTAATTGCCAGGAACCTAAATTGGGTAGTCTTGTAGTGCGTTGTTCGTTCTATGAAGACTTTTTAGAGTATCATGACGTTCGTGTTGTTTTAGATTTCATCTAAACGAACAAACTAAAATGTCTGATAATGGACCCCAAAATCAGCGAAATGCACCCCGCATTACGTTTGGTGGACCCTCAGATTCAACTGGCAGTAACCAGAATGGAGAACGCAGTGGGGCGCGATCAAAACAACGTCGGCCCCAAGGTTTACCCAATAATACTGCGTCTTGGTTCACCGCTC
TCACTCAACATGGCAAGGAAGACCTTAAATTCCCTCGAGGACAAGGCGTTCCAATTAACACCAATAGCAGTCCAGATGACCAAATTGGCTACTACCGAAGAGCTACCAGACGAATTCGTGGTGGTGACGGTAAAATGAAAGATCTCAGTCCAAGATGGTATTTCTACTACCTAGGAACTGGGCCAGAAGCTGGACTTCCCTATGGTGCTAACAAAGACGGCATCATATGGGTTGCAACTGAGGGAGCCTTGAATACACCAAAAGATCACATTGGCACCCGCAATCCTGCTAACAATGCTGCAATCGTGCTACAACTTCCTCAAGGAACAACATTGCCAAAAGGCTTCTACGCAGAAGGGAGCAGAGGCGGCAGTCAAGCCTCTTCTCGTTCCTCATCACGTAGTCGCAACAGTTCAAGAAATTCAACTCCAGGCAGCAGTAGGGGAACTTCTCCTGCTAGAATGGCTGGCAATGGCGGTGATGCTGCTCTTGCTTTGCTGCTGCTTGACAGATTGAACCAGCTTGAGAGCAAAATGTCTGGTAAAGGCCAACAACAACAAGGCCAAACTGTCACTAAGAAATCTGCTGCTGAGGCTTCTAAGAAGCCTCGGCAAAAACGTACTGCCACTAAAGCATACAATGTAACACAAGCTTTCGGCAGACGTGGTCCAGAACAAACCCAAGGAAATTTTGGGGACCAGGAACTAATCAGACAAGGAACTGATTACAAACATTGGCCGCAAATTGCACAATTTGCCCCCAGCGCTTCAGCGTTCTTCGGAATGTCGCGCATTGGCATGGAAGTCACACCTTCGGGAACGTGGTTGACCTACACAGGTGCCATCAAATTGGATGACAAAGATCCAAATTTCAAAGATCAAGTCATTTTGCTGAATAAGCATATTGACGCATACAAAACATTCCCACCAACAGAGCCTAAAAAGGACAAAAAGAAGAAGGCTGATGAAACTCAAGCCTTACCGCAGAGACAGAAGAAACAGCAAACTGTGACTCTTCTTCCTGCTGCAGATTTGGATGATTTCTCCAAACAATTGCAACAATCCATGAGCAGTGCTGACTCAACTCAGGCCTAAACTCATGCAGACCACACAAGGCAGATGGGCTATATAAACGTTTTCGCTTTTCCGTTTACGATATATAGTCTACTCTTGTGCAGAATGAATTCTCGTAACTACATAGCACAAGTAGATGTAGTTAACTTTAATCTCACATAGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
\ No newline at end of file
=====================================
pangoLEARN/training/utils.py
=====================================
@@ -0,0 +1,62 @@
+from Bio import SeqIO
+import os
+import csv
+import hashlib
+import sys
+
+def version_from_init(init_file):
+    # Extract the __version__ string from a package __init__.py
+    version = None
+    with open(init_file, "r") as fr:
+        for l in fr:
+            if l.startswith("__version__"):
+                l = l.rstrip("\n")
+                version = l.split('=')[1]
+                version = version.replace('"', "").replace(" ", "")
+                break
+    return version
+
+def get_pango_version(pango_path):
+    # Walk the installation directory and read the version from its __init__.py
+    version = ""
+
+    for r, d, f in os.walk(pango_path):
+        for fn in f:
+            if fn == "__init__.py":
+                version = version_from_init(os.path.join(r, fn))
+                if not version:
+                    continue
+                print("Pango version is:", version)
+
+    if not version:
+        sys.stderr.write("No version found at pango path\n")
+        sys.exit(-1)
+    else:
+        return version
+
+def get_hash_string(record):
+    # MD5 hex digest of the upper-cased sequence of a SeqRecord
+    seq = str(record.seq).upper().encode()
+    hash_object = hashlib.md5(seq)
+    hash_string = hash_object.hexdigest()
+    return hash_string
+
+def get_dict(in_csv, name_column, data_column):
+    # Build a dict mapping name_column values to data_column values from a CSV
+    this_dict = {}
+    with open(in_csv, "r") as f:
+        reader = csv.DictReader(f)
+        for row in reader:
+            this_dict[row[name_column]] = row[data_column]
+    return this_dict
+
+def add_to_hash(seq_file):
+    # Index a FASTA file both by MD5 digest and by raw sequence string
+    hash_map = {}
+    seq_hash = {}
+    for record in SeqIO.parse(seq_file, "fasta"):
+        seq = str(record.seq).upper().encode()
+        hash_object = hashlib.md5(seq)
+        hash_map[hash_object.hexdigest()] = record.id
+        seq_hash[str(record.seq)] = record.id
+    return hash_map, seq_hash
\ No newline at end of file
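
For orientation, a minimal usage sketch of the helpers added above; the module path, the pango-designation location, the FASTA/CSV file names and the CSV column names are illustrative assumptions, not taken from the commit:

# Illustrative sketch only; paths, file names and column names are assumptions.
from pangoLEARN.training.utils import get_pango_version, get_dict, add_to_hash

version = get_pango_version("/path/to/pango_designation")  # prints the version it finds, then returns it
hash_map, seq_hash = add_to_hash("sequences.fasta")        # MD5 digest -> record id, raw sequence -> record id
lineages = get_dict("lineages.csv", "taxon", "lineage")    # name column -> data column
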
View it on GitLab: https://salsa.debian.org/med-team/python-pangolearn/-/compare/a75aa7790b1f037d8812b5ed27d1d5a16e28e797...2fdb284a42dd7989b09ff177672e9d858f4d72fc