import glob
import pickle
import statistics


def analyze_big_tree_info(folder, max_rf=34):
    """Summarise the per-method RF scores stored under *folder*.

    Every ``*.big_tree_results.pickle`` file found recursively below
    *folder* is unpickled and its ``"nj_rf"``, ``"mp_rf"`` and
    ``"upgma_rf"`` entries are collected.

    Args:
        folder: Root directory that is searched recursively.
        max_rf: Value every statistic is divided by. Defaults to 34, the
            constant the original code used (presumably the maximum RF
            distance for the tree size of this experiment -- TODO confirm).

    Returns:
        A list of six floats, in this order:
        ``[nj_mean, nj_stdev, mp_mean, mp_stdev, upgma_mean, upgma_stdev]``,
        each divided by *max_rf*.

    Raises:
        ValueError: If fewer than two result files are found; the sample
            standard deviation is undefined for fewer than two values.
        KeyError: If a result file lacks one of the expected keys.
    """
    # Output order is nj, mp, upgma (kept from the original return tuple).
    methods = ("nj", "mp", "upgma")
    rf_values = {method: [] for method in methods}

    pattern = folder + "/**/*.big_tree_results.pickle"
    for big_tree_path in glob.glob(pattern, recursive=True):
        with open(big_tree_path, "rb") as f:
            # NOTE: pickle.load can execute arbitrary code; only point this
            # function at result files produced by a trusted pipeline.
            data = pickle.load(f)
        for method in methods:
            rf_values[method].append(data[method + "_rf"])

    n_files = len(rf_values[methods[0]])
    if n_files < 2:
        raise ValueError(
            "Need at least two big_tree_results files under {} "
            "(found {})".format(folder, n_files)
        )

    summary = []
    for method in methods:
        values = rf_values[method]
        # statistics.mean replaces the undefined name `average`, which made
        # the original raise NameError on every call.
        summary.append(statistics.mean(values) / max_rf)
        summary.append(statistics.stdev(values) / max_rf)
    return summary
import glob
import pickle
import re
from pathlib import Path

# ete3 and tree_evaluation are imported inside the functions that need them,
# so the pure helpers in this cell (e.g. parse_ghost) stay usable even when
# those packages cannot be imported.

# A bracketed annotation followed by ":<number>". The bracket body is
# restricted to non-"]" characters so a match can never run across several
# annotations, and the number may be an integer, a decimal or use an exponent.
_GHOST_ANNOTATION = re.compile(
    r"\[[^\]]*\]:(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?"
)

# Order in which the per-method file counts are validated.
_VALIDATION_ORDER = ("wag", "ghost", "modelfinder", "mp", "nj", "upgma")
# Order in which the accuracies are inserted into the result dictionary.
_RESULT_ORDER = ("wag", "modelfinder", "ghost", "mp", "nj", "upgma")
# (result label, sub-folder holding that network's correct_proportion.pickle)
_DNN_PREDICTION_DIRS = (
    ("dnn1", "quartet_predictions_dnn1_588"),
    ("dnn2", "quartet_predictions_dnn2_1272"),
    ("dnn3", "quartet_predictions_dnn3_1098"),
)


def parse_ghost(ghost_string):
    """Strip ``[...]:<length>`` annotations from a newick string.

    Every bracketed annotation together with the ``:<number>`` that directly
    follows it is removed; all other text is returned unchanged.

    Args:
        ghost_string: Newick text, e.g. the first line of a
            ``*.ghost.treefile``.

    Returns:
        The newick text without bracket annotations and their lengths.
    """
    return _GHOST_ANNOTATION.sub("", ghost_string)


def _collect_treefiles(parent_path, expected_count):
    """Glob the tree files of every method and check their number."""
    files_by_method = {}
    for method in _VALIDATION_ORDER:
        pattern = parent_path + "/quartet_files/*.{}.treefile".format(method)
        paths = glob.glob(pattern)
        if len(paths) != expected_count:
            raise ValueError(
                "Incorrect number of {} files in path {}".format(method, pattern)
            )
        files_by_method[method] = paths
    return files_by_method


def _read_first_line(path):
    """Return the first line of the text file at *path*."""
    with open(path, "r") as handle:
        return handle.readline()


def _load_trees(method, paths):
    """Build one ete3 Tree per file, using the reader the method needs."""
    from ete3 import Tree

    if method == "ghost":
        # ghost tree files carry bracket annotations that are stripped first.
        return [Tree(parse_ghost(_read_first_line(p))) for p in paths]
    if method == "mp":
        # Only the first line of an mp tree file is used as the newick.
        return [Tree(_read_first_line(p)) for p in paths]
    return [Tree(p) for p in paths]


def _compute_quartet_accuracies(orig_tree_path, expected_count):
    """Return ``(parent_path, result_dict)`` for one reference tree."""
    from ete3 import Tree
    import tree_evaluation

    orig_tree = Tree(orig_tree_path)
    parent_path = str(Path(orig_tree_path).parents[0])

    # Validate every method's file count before any tree is parsed.
    files_by_method = _collect_treefiles(parent_path, expected_count)

    result_dict = {"orig_tree": orig_tree}
    for method in _RESULT_ORDER:
        trees = _load_trees(method, files_by_method[method])
        result_dict[method + "_quartet_accuracy"] = (
            tree_evaluation.accuracy_of_tree_list_from_reference(orig_tree, trees)
        )

    for label, subfolder in _DNN_PREDICTION_DIRS:
        pickle_path = parent_path + "/" + subfolder + "/correct_proportion.pickle"
        with open(pickle_path, "rb") as handle:
            # NOTE: pickle.load can execute arbitrary code; these files are
            # expected to come from the project's own prediction step.
            result_dict[label + "_quartet_accuracy"] = pickle.load(handle)

    return parent_path, result_dict


def quartet_accuracy_replicate(orig_tree_path, expected_count=4845):
    """Evaluate all quartet predictions for one replicate and save them.

    The reference tree is read from *orig_tree_path*. The quartet tree files
    of each method are taken from ``quartet_files/`` next to it and scored
    with ``tree_evaluation.accuracy_of_tree_list_from_reference``. The three
    stored ``correct_proportion.pickle`` values of the DNN predictions are
    added, and the result is written to ``quartet_results.pickle`` in the
    same directory.

    Args:
        orig_tree_path: Path of the reference tree file.
        expected_count: Number of tree files required per method. Defaults
            to 4845, the constant the original code used (this equals
            C(20, 4); presumably all quartets of a 20-leaf tree -- TODO
            confirm).

    Returns:
        Dictionary with the key ``"orig_tree"`` plus one
        ``"<name>_quartet_accuracy"`` entry for wag, modelfinder, ghost, mp,
        nj, upgma, dnn1, dnn2 and dnn3.

    Raises:
        ValueError: If a method does not have exactly *expected_count* files.
    """
    parent_path, result_dict = _compute_quartet_accuracies(
        orig_tree_path, expected_count
    )
    with open(parent_path + "/quartet_results.pickle", "wb") as handle:
        pickle.dump(result_dict, handle)
    return result_dict


def gather_quartets(superfolder, expected_count=4845):
    """Compute the quartet accuracies of every replicate below *superfolder*.

    All ``*.orig_tree`` files found recursively are evaluated the same way
    as in ``quartet_accuracy_replicate``, but nothing is written to disk.
    The original version computed these results and then discarded them;
    they are now returned.

    Args:
        superfolder: Root directory that is searched recursively.
        expected_count: Number of tree files required per method.

    Returns:
        Dictionary mapping each reference tree path to its result dictionary.

    Raises:
        ValueError: If a method does not have exactly *expected_count* files
            for some replicate.
    """
    results = {}
    pattern = superfolder + "/**/*.orig_tree"
    for orig_tree_path in glob.glob(pattern, recursive=True):
        _, result_dict = _compute_quartet_accuracies(orig_tree_path, expected_count)
        results[orig_tree_path] = result_dict
    return results