From a92f852ec6fa3e2467f4050750886ec5bb16548d Mon Sep 17 00:00:00 2001 From: Coos Baakman Date: Tue, 27 Sep 2022 14:55:53 +0200 Subject: [PATCH 1/5] method to derive class --- deeprank/models/variant.py | 1 + deeprank/tools/metrics.py | 81 +++++++++----------------------------- test/tools/test_metrics.py | 16 ++++++++ 3 files changed, 36 insertions(+), 62 deletions(-) create mode 100644 test/tools/test_metrics.py diff --git a/deeprank/models/variant.py b/deeprank/models/variant.py index ee5839a..fbe127b 100644 --- a/deeprank/models/variant.py +++ b/deeprank/models/variant.py @@ -5,6 +5,7 @@ class VariantClass(Enum): + UNKNOWN = -1 BENIGN = 0 PATHOGENIC = 1 diff --git a/deeprank/tools/metrics.py b/deeprank/tools/metrics.py index 5a4968d..c8dc6fe 100644 --- a/deeprank/tools/metrics.py +++ b/deeprank/tools/metrics.py @@ -1,74 +1,31 @@ -from math import sqrt +from typing import List, Optional +import torch -def get_tp_tn_fp_fn(output_data, target_data): - """ A classification metric +from deeprank.models.variant import VariantClass - Args: - output_data(array of dimension (x,2)): considered negative if left value > right value - target_data(array of dimension (x,1)): considered negative if 0, positive otherewise - Returns (four floats): - true positive count (tp) - true negative count (tn) - false positive count (fp) - false negative count (fn) +def get_labels_from_probabilities(probabilities: torch.tensor, + unknown_treshold: Optional[float] = 0.5) -> List[VariantClass]: + """ + Args: + probabilities: [x, 2] considered negative if left value > right value + unknown_treshold: if the values are both below this value, then consider the class UNKNOWN """ - tp = 0 - tn = 0 - fp = 0 - fn = 0 - - total = output_data.shape[0] - if total == 0: - raise ValueError("0 output data entries") + total = probabilities.shape[0] + labels = [] for index in range(total): - output0, output1 = output_data[index,:] - target = target_data[index] - - if output0 > output1: # negative output - - if target != 0: # wrong - - fn += 1 - - else: # right - - tn += 1 - - else: # positive output + if probabilities[index, 0] < unknown_treshold and probabilities[index, 1] < unknown_treshold: + label = VariantClass.UNKNOWN - if target != 0: # right - - tp += 1 - - else: # wrong - - fp += 1 - - return tp, tn, fp, fn - - -def get_mcc(tp, tn, fp, fn): - """ The Mathews Correlation Coefficient - - Args: - tp (float): true positive count - tn (float): true negative count - fp (float): false positive count - fn (float): false negative count - - Returns (float): Mathews Correlation Coefficient - """ + elif probabilities[index, 0] < probabilities[index, 1]: + label = VariantClass.PATHOGENIC - numerator = tp * tn - fp * fn - if numerator == 0: - return 0.0 + else: + label = VariantClass.BENIGN - denominator = sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) - if denominator == 0: - raise ValueError(f"MCC denominator is zero for tp={tp}, tn={tn}, fp={fp}, fn={fn}") + labels.append(label) - return numerator / denominator + return labels diff --git a/test/tools/test_metrics.py b/test/tools/test_metrics.py new file mode 100644 index 0000000..8918bdd --- /dev/null +++ b/test/tools/test_metrics.py @@ -0,0 +1,16 @@ +import torch + +from deeprank.models.variant import VariantClass +from deeprank.tools.metrics import get_labels_from_probabilities + + +def test_labels_from_probabililties(): + data = torch.tensor([[0.1, -0.1], + [0.0, 1.1], + [1.2, -0.1]]) + + labels = get_labels_from_probabilities(data) + + assert labels == [VariantClass.UNKNOWN, + VariantClass.PATHOGENIC, + VariantClass.BENIGN], f"labels are {labels}" From 4c5ae4f6d14505e6f6da260740ff51a948d8d3a7 Mon Sep 17 00:00:00 2001 From: Coos Baakman Date: Tue, 27 Sep 2022 16:01:04 +0200 Subject: [PATCH 2/5] it works --- deeprank/models/metrics.py | 66 +++++++++++++++++++++++++------------ deeprank/tools/metrics.py | 32 ++++++++++++++---- scripts/learn.py | 4 +-- test/models/test_metrics.py | 4 +-- test/test_learn.py | 4 +-- test/tools/test_metrics.py | 18 ++++++++-- 6 files changed, 92 insertions(+), 36 deletions(-) diff --git a/deeprank/models/metrics.py b/deeprank/models/metrics.py index 6574b61..f11d444 100644 --- a/deeprank/models/metrics.py +++ b/deeprank/models/metrics.py @@ -1,7 +1,7 @@ import lzma import os import csv -from typing import List, Tuple, Any +from typing import List, Tuple, Any, Optional from math import sqrt from torch import argmax, tensor @@ -9,6 +9,9 @@ from torch.utils.tensorboard import SummaryWriter from sklearn.metrics import roc_auc_score +from deeprank.models.variant import VariantClass +from deeprank.tools.metrics import get_labels_from_output, get_labels_from_targets + class MetricsExporter: "The class implements an object, to be called when a neural network generates output" @@ -49,12 +52,19 @@ def process(self, pass_name: str, epoch_number: int, metrics_exporter.process(pass_name, epoch_number, entry_names, output_values, target_values) -class TensorboardBinaryClassificationExporter(MetricsExporter): - "exports to tensorboard, works for binary classification only" +class TensorboardVariantClassificationExporter(MetricsExporter): + "exports to tensorboard, works for variant classification only" + + def __init__(self, directory_path: str, unknown_treshold: Optional[float] = 0.5): + """ + Args: + directory_path: where to store tensorboard files + unknown_treshold: if both class probabilities are below this value, give the class label UNKNOWN + """ - def __init__(self, directory_path): self._directory_path = directory_path self._writer = SummaryWriter(log_dir=directory_path) + self._unknown_treshold = unknown_treshold def __enter__(self): self._writer.__enter__() @@ -67,33 +77,45 @@ def process(self, pass_name: str, epoch_number: int, entry_names: List[str], output_values: List[Any], target_values: List[Any]): "write to tensorboard" - loss = cross_entropy(tensor(output_values), tensor(target_values)) + output_tensor = tensor(output_values) + target_tensor = tensor(target_values) + + loss = cross_entropy(output_tensor, target_tensor) self._writer.add_scalar(f"{pass_name} loss", loss, epoch_number) - probabilities = [] + # lists of VariantClass values + prediction_labels = get_labels_from_output(output_tensor, unknown_treshold=self._unknown_treshold) + target_labels = get_labels_from_targets(target_tensor) + + roc_probabilities = [] # floating point values + roc_targets = [] # list of 0/1 values + fp, fn, tp, tn = 0, 0, 0, 0 for entry_index, entry_name in enumerate(entry_names): - probability = output_values[entry_index][1] - probabilities.append(probability) - - prediction_value = argmax(tensor(output_values[entry_index])) - target_value = target_values[entry_index] + prediction_label = prediction_labels[entry_index] + target_label = target_labels[entry_index] - if prediction_value > 0.0 and target_value > 0.0: + if prediction_label == VariantClass.PATHOGENIC and target_label == VariantClass.PATHOGENIC: tp += 1 - elif prediction_value <= 0.0 and target_value <= 0.0: + elif prediction_label == VariantClass.BENIGN and target_label == VariantClass.BENIGN: tn += 1 - elif prediction_value > 0.0 and target_value <= 0.0: + elif prediction_label == VariantClass.PATHOGENIC and target_label == VariantClass.BENIGN: fp += 1 - elif prediction_value <= 0.0 and target_value > 0.0: + elif prediction_label == VariantClass.BENIGN and target_label == VariantClass.PATHOGENIC: fn += 1 + if prediction_label != VariantClass.UNKNOWN: + roc_probabilities.append(output_values[entry_index][1]) + roc_targets.append(target_values[entry_index]) + + # Furthermore, UNKNOWN variants are completely ignored.. + mcc_numerator = tn * tp - fp * fn if mcc_numerator == 0.0: - self._writer.add_scalar(f"{pass_name} MCC", 0.0, epoch_number) + self._writer.add_scalar(f"{pass_name} MCC", 0.0, epoch_number) else: mcc_denominator = sqrt((tn + fn) * (fp + tp) * (tn + fp) * (fn + tp)) @@ -101,12 +123,14 @@ def process(self, pass_name: str, epoch_number: int, mcc = mcc_numerator / mcc_denominator self._writer.add_scalar(f"{pass_name} MCC", mcc, epoch_number) - accuracy = (tp + tn) / (tp + tn + fp + fn) - self._writer.add_scalar(f"{pass_name} accuracy", accuracy, epoch_number) + accuracy_denominator = tp + tn + fp + fn + if accuracy_denominator > 0: + accuracy = (tp + tn) / accuracy_denominator + self._writer.add_scalar(f"{pass_name} accuracy", accuracy, epoch_number) - # for ROC curves to work, we need both class values in the set - if len(set(target_values)) == 2: - roc_auc = roc_auc_score(target_values, probabilities) + # for ROC curves to work, we need both class values in the target set + if len(set(roc_targets)) >= 2: + roc_auc = roc_auc_score(roc_targets, roc_probabilities) self._writer.add_scalar(f"{pass_name} ROC AUC", roc_auc, epoch_number) diff --git a/deeprank/tools/metrics.py b/deeprank/tools/metrics.py index c8dc6fe..598fddb 100644 --- a/deeprank/tools/metrics.py +++ b/deeprank/tools/metrics.py @@ -5,22 +5,22 @@ from deeprank.models.variant import VariantClass -def get_labels_from_probabilities(probabilities: torch.tensor, - unknown_treshold: Optional[float] = 0.5) -> List[VariantClass]: +def get_labels_from_output(output_data: torch.Tensor, + unknown_treshold: Optional[float] = 0.5) -> List[VariantClass]: """ Args: - probabilities: [x, 2] considered negative if left value > right value + output_data: [x, 2] considered BENIGN if left value > right value and otherwise PATHOGENIC unknown_treshold: if the values are both below this value, then consider the class UNKNOWN """ - total = probabilities.shape[0] + total = output_data.shape[0] labels = [] for index in range(total): - if probabilities[index, 0] < unknown_treshold and probabilities[index, 1] < unknown_treshold: + if output_data[index, 0] < unknown_treshold and output_data[index, 1] < unknown_treshold: label = VariantClass.UNKNOWN - elif probabilities[index, 0] < probabilities[index, 1]: + elif output_data[index, 0] < output_data[index, 1]: label = VariantClass.PATHOGENIC else: @@ -29,3 +29,23 @@ def get_labels_from_probabilities(probabilities: torch.tensor, labels.append(label) return labels + + +def get_labels_from_targets(target_data: torch.Tensor) -> List[VariantClass]: + """ + Args: + target_data: [x, 1] where 0 means BENIGN and 1 means PATHOGENIC + """ + + total = target_data.shape[0] + + labels = [] + for index in range(total): + if target_data[index] > 0: + label = VariantClass.PATHOGENIC + else: + label = VariantClass.BENIGN + + labels.append(label) + + return labels diff --git a/scripts/learn.py b/scripts/learn.py index 3daea7c..80c5939 100755 --- a/scripts/learn.py +++ b/scripts/learn.py @@ -14,7 +14,7 @@ from deeprank.learn.DataSet import DataSet from deeprank.learn.model3d import cnn_class from deeprank.models.metrics import OutputExporter -from deeprank.models.metrics import TensorboardBinaryClassificationExporter +from deeprank.models.metrics import TensorboardVariantClassificationExporter logging.basicConfig(filename="learn-%d.log" % os.getpid(), filemode="w", level=logging.INFO) @@ -80,6 +80,6 @@ def interpret_args(args, usage): neural_net = NeuralNet(dataset, cnn_class, model_type='3d',task='class', cuda=False, metrics_exporters=[OutputExporter(run_directory), - TensorboardBinaryClassificationExporter(run_directory)]) + TensorboardVariantClassificationExporter(run_directory)]) neural_net.optimizer = optim.AdamW(neural_net.net.parameters(), lr=0.001, weight_decay=0.005) neural_net.train(nepoch = epoch_count, divide_trainset=None, train_batch_size = 5, num_workers=0) diff --git a/test/models/test_metrics.py b/test/models/test_metrics.py index a311a7f..f1226d5 100644 --- a/test/models/test_metrics.py +++ b/test/models/test_metrics.py @@ -2,7 +2,7 @@ import shutil import os -from deeprank.models.metrics import TensorboardBinaryClassificationExporter, OutputExporter +from deeprank.models.metrics import TensorboardVariantClassificationExporter, OutputExporter test_entries = ["entry0", "entry1"] @@ -14,7 +14,7 @@ def test_tensorboard_class_output(): tmp_dir_path = tempfile.mkdtemp() try: - exporter = TensorboardBinaryClassificationExporter(tmp_dir_path) + exporter = TensorboardVariantClassificationExporter(tmp_dir_path) with exporter: exporter.process("unit-testing", 0, test_entries, test_outputs, test_targets) diff --git a/test/test_learn.py b/test/test_learn.py index 6dbd776..44c5b27 100644 --- a/test/test_learn.py +++ b/test/test_learn.py @@ -14,7 +14,7 @@ from deeprank.learn.model3d import cnn_class from deeprank.models.environment import Environment from deeprank.domain.amino_acid import * -from deeprank.models.metrics import OutputExporter, TensorboardBinaryClassificationExporter +from deeprank.models.metrics import OutputExporter, TensorboardVariantClassificationExporter import deeprank.config @@ -81,7 +81,7 @@ def test_learn(): neural_net = NeuralNet(dataset, cnn_class, model_type='3d',task='class', cuda=False, metrics_exporters=[OutputExporter(metrics_directory), - TensorboardBinaryClassificationExporter(metrics_directory)]) + TensorboardVariantClassificationExporter(metrics_directory)]) neural_net.optimizer = optim.SGD(neural_net.net.parameters(), lr=0.001, diff --git a/test/tools/test_metrics.py b/test/tools/test_metrics.py index 8918bdd..e4af5fc 100644 --- a/test/tools/test_metrics.py +++ b/test/tools/test_metrics.py @@ -1,16 +1,28 @@ import torch from deeprank.models.variant import VariantClass -from deeprank.tools.metrics import get_labels_from_probabilities +from deeprank.tools.metrics import get_labels_from_output, get_labels_from_targets -def test_labels_from_probabililties(): +def test_labels_from_output(): data = torch.tensor([[0.1, -0.1], [0.0, 1.1], [1.2, -0.1]]) - labels = get_labels_from_probabilities(data) + labels = get_labels_from_output(data) assert labels == [VariantClass.UNKNOWN, VariantClass.PATHOGENIC, VariantClass.BENIGN], f"labels are {labels}" + + +def get_labels_from_targets(): + data = torch.tensor([[0], [1], [1], [0], [0]]) + + labels = get_labels_from_targets(data) + + assert labels == [VariantClass.BENIGN, + VariantClass.PATHOGENIC, + VariantClass.PATHOGENIC, + VariantClass.BENIGN, + VariantClass.BENIGN], f"labels are {labels}" From 34be7ec9b3e1f49f43b862550211c929c1d01092 Mon Sep 17 00:00:00 2001 From: Coos Baakman Date: Tue, 27 Sep 2022 16:25:56 +0200 Subject: [PATCH 3/5] small modification: the difference determines whether something is unknown --- deeprank/models/metrics.py | 2 +- deeprank/tools/metrics.py | 4 ++-- test/tools/{test_metrics.py => test_metrics_tools.py} | 0 3 files changed, 3 insertions(+), 3 deletions(-) rename test/tools/{test_metrics.py => test_metrics_tools.py} (100%) diff --git a/deeprank/models/metrics.py b/deeprank/models/metrics.py index f11d444..5acd7c2 100644 --- a/deeprank/models/metrics.py +++ b/deeprank/models/metrics.py @@ -59,7 +59,7 @@ def __init__(self, directory_path: str, unknown_treshold: Optional[float] = 0.5) """ Args: directory_path: where to store tensorboard files - unknown_treshold: if both class probabilities are below this value, give the class label UNKNOWN + unknown_treshold: if the absolute difference between the class probabilities is below this value, give the class label UNKNOWN """ self._directory_path = directory_path diff --git a/deeprank/tools/metrics.py b/deeprank/tools/metrics.py index 598fddb..3c81426 100644 --- a/deeprank/tools/metrics.py +++ b/deeprank/tools/metrics.py @@ -10,14 +10,14 @@ def get_labels_from_output(output_data: torch.Tensor, """ Args: output_data: [x, 2] considered BENIGN if left value > right value and otherwise PATHOGENIC - unknown_treshold: if the values are both below this value, then consider the class UNKNOWN + unknown_treshold: if the absolute difference between the values is below this value, then consider the class UNKNOWN """ total = output_data.shape[0] labels = [] for index in range(total): - if output_data[index, 0] < unknown_treshold and output_data[index, 1] < unknown_treshold: + if abs(output_data[index, 0] - output_data[index, 1]) < unknown_treshold: label = VariantClass.UNKNOWN elif output_data[index, 0] < output_data[index, 1]: diff --git a/test/tools/test_metrics.py b/test/tools/test_metrics_tools.py similarity index 100% rename from test/tools/test_metrics.py rename to test/tools/test_metrics_tools.py From 7a55fd6686f2e69fda8059ddcb5b34d72b7a4187 Mon Sep 17 00:00:00 2001 From: Coos Baakman Date: Wed, 28 Sep 2022 12:05:40 +0200 Subject: [PATCH 4/5] add a label table --- deeprank/models/metrics.py | 34 ++++++++++++++++++++++++++++++++++ scripts/learn.py | 5 ++--- test/models/test_metrics.py | 16 +++++++++++++++- 3 files changed, 51 insertions(+), 4 deletions(-) diff --git a/deeprank/models/metrics.py b/deeprank/models/metrics.py index 5acd7c2..7a6a89f 100644 --- a/deeprank/models/metrics.py +++ b/deeprank/models/metrics.py @@ -163,3 +163,37 @@ def process(self, pass_name: str, epoch_number: int, target_value = target_values[entry_index] w.writerow([entry_name, str(output_value), str(target_value)]) + + +class LabelExporter(MetricsExporter): + "writes all labels to a table" + + def __init__(self, directory_path, unknown_treshold: Optional[float] = 0.5): + self._directory_path = directory_path + self._unknown_treshold = unknown_treshold + + def get_filename(self, pass_name, epoch_number): + "returns the filename for the table" + return os.path.join(self._directory_path, f"labels-{pass_name}-epoch-{epoch_number}.csv.xz") + + def process(self, pass_name: str, epoch_number: int, + entry_names: List[str], output_values: List[Any], target_values: List[Any]): + "write the output to the table" + + # lists of VariantClass values + output_labels = get_labels_from_output(tensor(output_values), unknown_treshold=self._unknown_treshold) + target_labels = get_labels_from_targets(tensor(target_values)) + + if not os.path.isdir(self._directory_path): + os.mkdir(self._directory_path) + + with lzma.open(self.get_filename(pass_name, epoch_number), 'wt') as f: + w = csv.writer(f) + + w.writerow(["entry", "output_label", "target_label"]) + + for entry_index, entry_name in enumerate(entry_names): + output_label = output_labels[entry_index] + target_label = target_labels[entry_index] + + w.writerow([entry_name, output_label, target_label]) diff --git a/scripts/learn.py b/scripts/learn.py index 80c5939..c934328 100755 --- a/scripts/learn.py +++ b/scripts/learn.py @@ -13,8 +13,7 @@ from deeprank.learn.NeuralNet import NeuralNet from deeprank.learn.DataSet import DataSet from deeprank.learn.model3d import cnn_class -from deeprank.models.metrics import OutputExporter -from deeprank.models.metrics import TensorboardVariantClassificationExporter +from deeprank.models.metrics import OutputExporter, LabelExporter, TensorboardVariantClassificationExporter logging.basicConfig(filename="learn-%d.log" % os.getpid(), filemode="w", level=logging.INFO) @@ -79,7 +78,7 @@ def interpret_args(args, usage): run_directory = "run-{}".format(os.getpid()) neural_net = NeuralNet(dataset, cnn_class, model_type='3d',task='class', cuda=False, - metrics_exporters=[OutputExporter(run_directory), + metrics_exporters=[OutputExporter(run_directory), LabelExporter(run_directory), TensorboardVariantClassificationExporter(run_directory)]) neural_net.optimizer = optim.AdamW(neural_net.net.parameters(), lr=0.001, weight_decay=0.005) neural_net.train(nepoch = epoch_count, divide_trainset=None, train_batch_size = 5, num_workers=0) diff --git a/test/models/test_metrics.py b/test/models/test_metrics.py index f1226d5..f0a957e 100644 --- a/test/models/test_metrics.py +++ b/test/models/test_metrics.py @@ -2,7 +2,7 @@ import shutil import os -from deeprank.models.metrics import TensorboardVariantClassificationExporter, OutputExporter +from deeprank.models.metrics import TensorboardVariantClassificationExporter, OutputExporter, LabelExporter test_entries = ["entry0", "entry1"] @@ -36,3 +36,17 @@ def test_output(): assert len(os.listdir(tmp_dir_path)) > 0, "output directory is empty" finally: shutil.rmtree(tmp_dir_path) + + +def test_label(): + tmp_dir_path = tempfile.mkdtemp() + try: + + exporter = LabelExporter(tmp_dir_path) + + with exporter: + exporter.process("unit-testing", 0, test_entries, test_outputs, test_targets) + + assert len(os.listdir(tmp_dir_path)) > 0, "output directory is empty" + finally: + shutil.rmtree(tmp_dir_path) From 9432496146d70e5f9a6a7630f04778a7dd6781ce Mon Sep 17 00:00:00 2001 From: Coos Baakman Date: Thu, 29 Sep 2022 10:47:06 +0200 Subject: [PATCH 5/5] write the label's names instead --- deeprank/models/metrics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/deeprank/models/metrics.py b/deeprank/models/metrics.py index 7a6a89f..6a21bb4 100644 --- a/deeprank/models/metrics.py +++ b/deeprank/models/metrics.py @@ -193,7 +193,7 @@ def process(self, pass_name: str, epoch_number: int, w.writerow(["entry", "output_label", "target_label"]) for entry_index, entry_name in enumerate(entry_names): - output_label = output_labels[entry_index] - target_label = target_labels[entry_index] + output_name = output_labels[entry_index].name + target_name = target_labels[entry_index].name - w.writerow([entry_name, output_label, target_label]) + w.writerow([entry_name, output_name, target_name])