Full upload so as not to lose anything important

This commit is contained in:
Jan Kowalczyk
2025-03-14 18:02:23 +01:00
parent 35fcfb7d5a
commit b824ff7482
33 changed files with 3539 additions and 353 deletions

View File

@@ -1,12 +1,17 @@
import json
import logging
import pickle
import time
import torch
import numpy as np
from torch.utils.data import DataLoader
import numpy as np
import torch
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score
from sklearn.metrics import (
average_precision_score,
precision_recall_curve,
roc_auc_score,
roc_curve,
)
from base.base_dataset import BaseADDataset
from networks.main import build_autoencoder
@@ -22,7 +27,7 @@ class IsoForest(object):
contamination=0.1,
n_jobs=-1,
seed=None,
**kwargs
**kwargs,
):
"""Init Isolation Forest instance."""
self.n_estimators = n_estimators
@@ -37,7 +42,7 @@ class IsoForest(object):
contamination=contamination,
n_jobs=n_jobs,
random_state=seed,
**kwargs
**kwargs,
)
self.hybrid = hybrid
@@ -47,28 +52,44 @@ class IsoForest(object):
"train_time": None,
"test_time": None,
"test_auc": None,
"test_roc": None,
"test_scores": None,
}
def train(
self, dataset: BaseADDataset, device: str = "cpu", n_jobs_dataloader: int = 0
self,
dataset: BaseADDataset,
device: str = "cpu",
n_jobs_dataloader: int = 0,
k_fold_idx: int = None,
):
"""Trains the Isolation Forest model on the training data."""
logger = logging.getLogger()
# do not drop last batch for non-SGD optimization shallow_ssad
train_loader = DataLoader(
dataset=dataset.train_set,
batch_size=128,
shuffle=True,
num_workers=n_jobs_dataloader,
drop_last=False,
)
# drop_last necessary?
# train_loader = DataLoader(
# dataset=dataset.train_set,
# batch_size=128,
# shuffle=True,
# num_workers=n_jobs_dataloader,
# drop_last=False,
# )
if k_fold_idx is not None:
train_loader, _ = dataset.loaders_k_fold(
fold_idx=k_fold_idx,
batch_size=128,
num_workers=n_jobs_dataloader,
)
else:
train_loader, _, _ = dataset.loaders(
batch_size=128, num_workers=n_jobs_dataloader
)
# Get data from loader
X = ()
for data in train_loader:
inputs, _, _, _ = data
inputs, _, _, _, _ = data
inputs = inputs.to(device)
if self.hybrid:
inputs = self.ae_net.encoder(
@@ -91,14 +112,25 @@ class IsoForest(object):
logger.info("Finished training.")
def test(
self, dataset: BaseADDataset, device: str = "cpu", n_jobs_dataloader: int = 0
self,
dataset: BaseADDataset,
device: str = "cpu",
n_jobs_dataloader: int = 0,
k_fold_idx: int = None,
):
"""Tests the Isolation Forest model on the test data."""
logger = logging.getLogger()
_, test_loader, _ = dataset.loaders(
batch_size=128, num_workers=n_jobs_dataloader
)
if k_fold_idx is not None:
_, test_loader = dataset.loaders_k_fold(
fold_idx=k_fold_idx,
batch_size=128,
num_workers=n_jobs_dataloader,
)
else:
_, test_loader, _ = dataset.loaders(
batch_size=128, num_workers=n_jobs_dataloader
)
# Get data from loader
idx_label_score = []
@@ -106,7 +138,7 @@ class IsoForest(object):
idxs = []
labels = []
for data in test_loader:
inputs, label_batch, _, idx = data
inputs, label_batch, _, idx, _ = data
inputs, label_batch, idx = (
inputs.to(device),
label_batch.to(device),
@@ -140,6 +172,9 @@ class IsoForest(object):
labels = np.array(labels)
scores = np.array(scores)
self.results["test_auc"] = roc_auc_score(labels, scores)
self.results["test_roc"] = roc_curve(labels, scores)
self.results["test_prc"] = precision_recall_curve(labels, scores)
self.results["test_ap"] = average_precision_score(labels, scores)
# Log results
logger.info("Test AUC: {:.2f}%".format(100.0 * self.results["test_auc"]))
@@ -178,7 +213,8 @@ class IsoForest(object):
"""Load Isolation Forest model from import_path."""
pass
def save_results(self, export_json):
def save_results(self, export_pkl):
"""Save results dict to a JSON-file."""
with open(export_json, "w") as fp:
json.dump(self.results, fp)
with open(export_pkl, "wb") as fp:
# json.dump(self.results, fp)
pickle.dump(self.results, fp)

View File

@@ -1,12 +1,18 @@
import json
import logging
import pickle
import time
import torch
import numpy as np
from torch.utils.data import DataLoader
from sklearn.svm import OneClassSVM
from sklearn.metrics import roc_auc_score
import numpy as np
import torch
from sklearn.metrics import (
average_precision_score,
precision_recall_curve,
roc_auc_score,
roc_curve,
)
from thundersvm import OneClassSVM
from base.base_dataset import BaseADDataset
from networks.main import build_autoencoder
@@ -21,7 +27,7 @@ class OCSVM(object):
self.rho = None
self.gamma = None
self.model = OneClassSVM(kernel=kernel, nu=nu)
self.model = OneClassSVM(kernel=kernel, nu=nu, verbose=True, max_mem_size=4048)
self.hybrid = hybrid
self.ae_net = None # autoencoder network for the case of a hybrid model
@@ -40,24 +46,31 @@ class OCSVM(object):
}
def train(
self, dataset: BaseADDataset, device: str = "cpu", n_jobs_dataloader: int = 0
self,
dataset: BaseADDataset,
device: str = "cpu",
n_jobs_dataloader: int = 0,
k_fold_idx: int = None,
batch_size: int = 32,
):
"""Trains the OC-SVM model on the training data."""
logger = logging.getLogger()
# do not drop last batch for non-SGD optimization shallow_ssad
train_loader = DataLoader(
dataset=dataset.train_set,
batch_size=128,
shuffle=True,
num_workers=n_jobs_dataloader,
drop_last=False,
)
if k_fold_idx is not None:
train_loader, _ = dataset.loaders_k_fold(
fold_idx=k_fold_idx,
batch_size=batch_size,
num_workers=n_jobs_dataloader,
)
else:
train_loader, _, _ = dataset.loaders(
batch_size=batch_size, num_workers=n_jobs_dataloader
)
# Get data from loader
X = ()
for data in train_loader:
inputs, _, _, _ = data
inputs, _, _, _, _ = data
inputs = inputs.to(device)
if self.hybrid:
inputs = self.ae_net.encoder(
@@ -77,14 +90,21 @@ class OCSVM(object):
best_auc = 0.0
# Sample hold-out set from test set
_, test_loader, _ = dataset.loaders(
batch_size=128, num_workers=n_jobs_dataloader
)
if k_fold_idx is not None:
_, test_loader = dataset.loaders_k_fold(
fold_idx=k_fold_idx,
batch_size=batch_size,
num_workers=n_jobs_dataloader,
)
else:
_, test_loader, _ = dataset.loaders(
batch_size=batch_size, num_workers=n_jobs_dataloader
)
X_test = ()
labels = []
for data in test_loader:
inputs, label_batch, _, _ = data
inputs, label_batch, _, _, _ = data
inputs, label_batch = inputs.to(device), label_batch.to(device)
if self.hybrid:
inputs = self.ae_net.encoder(
@@ -102,8 +122,9 @@ class OCSVM(object):
np.sum(labels == 1),
)
n_val = int(0.1 * n_test)
n_val_normal, n_val_outlier = int(n_val * (n_normal / n_test)), int(
n_val * (n_outlier / n_test)
n_val_normal, n_val_outlier = (
int(n_val * (n_normal / n_test)),
int(n_val * (n_outlier / n_test)),
)
perm = np.random.permutation(n_test)
X_val = np.concatenate(
@@ -116,9 +137,14 @@ class OCSVM(object):
i = 1
for gamma in gammas:
# Model candidate
model = OneClassSVM(kernel=self.kernel, nu=self.nu, gamma=gamma)
model = OneClassSVM(
kernel=self.kernel,
nu=self.nu,
gamma=gamma,
verbose=True,
max_mem_size=4048,
)
# Train
start_time = time.time()
@@ -147,7 +173,9 @@ class OCSVM(object):
# If hybrid, also train a model with linear kernel
if self.hybrid:
self.linear_model = OneClassSVM(kernel="linear", nu=self.nu)
self.linear_model = OneClassSVM(
kernel="linear", nu=self.nu, max_mem_size=4048
)
start_time = time.time()
self.linear_model.fit(X)
train_time = time.time() - start_time
@@ -160,14 +188,26 @@ class OCSVM(object):
logger.info("Finished training.")
def test(
self, dataset: BaseADDataset, device: str = "cpu", n_jobs_dataloader: int = 0
self,
dataset: BaseADDataset,
device: str = "cpu",
n_jobs_dataloader: int = 0,
k_fold_idx: int = None,
batch_size: int = 32,
):
"""Tests the OC-SVM model on the test data."""
logger = logging.getLogger()
_, test_loader, _ = dataset.loaders(
batch_size=128, num_workers=n_jobs_dataloader
)
if k_fold_idx is not None:
_, test_loader = dataset.loaders_k_fold(
fold_idx=k_fold_idx,
batch_size=batch_size,
num_workers=n_jobs_dataloader,
)
else:
_, test_loader, _ = dataset.loaders(
batch_size=batch_size, num_workers=n_jobs_dataloader
)
# Get data from loader
idx_label_score = []
@@ -175,7 +215,7 @@ class OCSVM(object):
idxs = []
labels = []
for data in test_loader:
inputs, label_batch, _, idx = data
inputs, label_batch, _, idx, _ = data
inputs, label_batch, idx = (
inputs.to(device),
label_batch.to(device),
@@ -212,6 +252,9 @@ class OCSVM(object):
labels = np.array(labels)
scores = np.array(scores)
self.results["test_auc"] = roc_auc_score(labels, scores)
self.results["test_roc"] = roc_curve(labels, scores)
self.results["test_prc"] = precision_recall_curve(labels, scores)
self.results["test_ap"] = average_precision_score(labels, scores)
# If hybrid, also test model with linear kernel
if self.hybrid:
@@ -268,7 +311,7 @@ class OCSVM(object):
"""Load OC-SVM model from import_path."""
pass
def save_results(self, export_json):
"""Save results dict to a JSON-file."""
with open(export_json, "w") as fp:
json.dump(self.results, fp)
def save_results(self, export_pkl):
with open(export_pkl, "wb") as fp:
# json.dump(self.results, fp)
pickle.dump(self.results, fp)