Full upload so as not to lose anything important
This commit is contained in:
@@ -1,12 +1,17 @@
|
||||
import json
|
||||
import logging
|
||||
import pickle
|
||||
import time
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
from torch.utils.data import DataLoader
|
||||
import numpy as np
|
||||
import torch
|
||||
from sklearn.ensemble import IsolationForest
|
||||
from sklearn.metrics import roc_auc_score
|
||||
from sklearn.metrics import (
|
||||
average_precision_score,
|
||||
precision_recall_curve,
|
||||
roc_auc_score,
|
||||
roc_curve,
|
||||
)
|
||||
|
||||
from base.base_dataset import BaseADDataset
|
||||
from networks.main import build_autoencoder
|
||||
|
||||
@@ -22,7 +27,7 @@ class IsoForest(object):
|
||||
contamination=0.1,
|
||||
n_jobs=-1,
|
||||
seed=None,
|
||||
**kwargs
|
||||
**kwargs,
|
||||
):
|
||||
"""Init Isolation Forest instance."""
|
||||
self.n_estimators = n_estimators
|
||||
@@ -37,7 +42,7 @@ class IsoForest(object):
|
||||
contamination=contamination,
|
||||
n_jobs=n_jobs,
|
||||
random_state=seed,
|
||||
**kwargs
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
self.hybrid = hybrid
|
||||
@@ -47,28 +52,44 @@ class IsoForest(object):
|
||||
"train_time": None,
|
||||
"test_time": None,
|
||||
"test_auc": None,
|
||||
"test_roc": None,
|
||||
"test_scores": None,
|
||||
}
|
||||
|
||||
def train(
|
||||
self, dataset: BaseADDataset, device: str = "cpu", n_jobs_dataloader: int = 0
|
||||
self,
|
||||
dataset: BaseADDataset,
|
||||
device: str = "cpu",
|
||||
n_jobs_dataloader: int = 0,
|
||||
k_fold_idx: int = None,
|
||||
):
|
||||
"""Trains the Isolation Forest model on the training data."""
|
||||
logger = logging.getLogger()
|
||||
|
||||
# do not drop last batch for non-SGD optimization shallow_ssad
|
||||
train_loader = DataLoader(
|
||||
dataset=dataset.train_set,
|
||||
batch_size=128,
|
||||
shuffle=True,
|
||||
num_workers=n_jobs_dataloader,
|
||||
drop_last=False,
|
||||
)
|
||||
# drop_last necessary?
|
||||
# train_loader = DataLoader(
|
||||
# dataset=dataset.train_set,
|
||||
# batch_size=128,
|
||||
# shuffle=True,
|
||||
# num_workers=n_jobs_dataloader,
|
||||
# drop_last=False,
|
||||
# )
|
||||
|
||||
if k_fold_idx is not None:
|
||||
train_loader, _ = dataset.loaders_k_fold(
|
||||
fold_idx=k_fold_idx,
|
||||
batch_size=128,
|
||||
num_workers=n_jobs_dataloader,
|
||||
)
|
||||
else:
|
||||
train_loader, _, _ = dataset.loaders(
|
||||
batch_size=128, num_workers=n_jobs_dataloader
|
||||
)
|
||||
# Get data from loader
|
||||
X = ()
|
||||
for data in train_loader:
|
||||
inputs, _, _, _ = data
|
||||
inputs, _, _, _, _ = data
|
||||
inputs = inputs.to(device)
|
||||
if self.hybrid:
|
||||
inputs = self.ae_net.encoder(
|
||||
@@ -91,14 +112,25 @@ class IsoForest(object):
|
||||
logger.info("Finished training.")
|
||||
|
||||
def test(
|
||||
self, dataset: BaseADDataset, device: str = "cpu", n_jobs_dataloader: int = 0
|
||||
self,
|
||||
dataset: BaseADDataset,
|
||||
device: str = "cpu",
|
||||
n_jobs_dataloader: int = 0,
|
||||
k_fold_idx: int = None,
|
||||
):
|
||||
"""Tests the Isolation Forest model on the test data."""
|
||||
logger = logging.getLogger()
|
||||
|
||||
_, test_loader, _ = dataset.loaders(
|
||||
batch_size=128, num_workers=n_jobs_dataloader
|
||||
)
|
||||
if k_fold_idx is not None:
|
||||
_, test_loader = dataset.loaders_k_fold(
|
||||
fold_idx=k_fold_idx,
|
||||
batch_size=128,
|
||||
num_workers=n_jobs_dataloader,
|
||||
)
|
||||
else:
|
||||
_, test_loader, _ = dataset.loaders(
|
||||
batch_size=128, num_workers=n_jobs_dataloader
|
||||
)
|
||||
|
||||
# Get data from loader
|
||||
idx_label_score = []
|
||||
@@ -106,7 +138,7 @@ class IsoForest(object):
|
||||
idxs = []
|
||||
labels = []
|
||||
for data in test_loader:
|
||||
inputs, label_batch, _, idx = data
|
||||
inputs, label_batch, _, idx, _ = data
|
||||
inputs, label_batch, idx = (
|
||||
inputs.to(device),
|
||||
label_batch.to(device),
|
||||
@@ -140,6 +172,9 @@ class IsoForest(object):
|
||||
labels = np.array(labels)
|
||||
scores = np.array(scores)
|
||||
self.results["test_auc"] = roc_auc_score(labels, scores)
|
||||
self.results["test_roc"] = roc_curve(labels, scores)
|
||||
self.results["test_prc"] = precision_recall_curve(labels, scores)
|
||||
self.results["test_ap"] = average_precision_score(labels, scores)
|
||||
|
||||
# Log results
|
||||
logger.info("Test AUC: {:.2f}%".format(100.0 * self.results["test_auc"]))
|
||||
@@ -178,7 +213,8 @@ class IsoForest(object):
|
||||
"""Load Isolation Forest model from import_path."""
|
||||
pass
|
||||
|
||||
def save_results(self, export_json):
|
||||
def save_results(self, export_pkl):
|
||||
"""Save results dict to a JSON-file."""
|
||||
with open(export_json, "w") as fp:
|
||||
json.dump(self.results, fp)
|
||||
with open(export_pkl, "wb") as fp:
|
||||
# json.dump(self.results, fp)
|
||||
pickle.dump(self.results, fp)
|
||||
|
||||
@@ -1,12 +1,18 @@
|
||||
import json
|
||||
import logging
|
||||
import pickle
|
||||
import time
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
from torch.utils.data import DataLoader
|
||||
from sklearn.svm import OneClassSVM
|
||||
from sklearn.metrics import roc_auc_score
|
||||
import numpy as np
|
||||
import torch
|
||||
from sklearn.metrics import (
|
||||
average_precision_score,
|
||||
precision_recall_curve,
|
||||
roc_auc_score,
|
||||
roc_curve,
|
||||
)
|
||||
from thundersvm import OneClassSVM
|
||||
|
||||
from base.base_dataset import BaseADDataset
|
||||
from networks.main import build_autoencoder
|
||||
|
||||
@@ -21,7 +27,7 @@ class OCSVM(object):
|
||||
self.rho = None
|
||||
self.gamma = None
|
||||
|
||||
self.model = OneClassSVM(kernel=kernel, nu=nu)
|
||||
self.model = OneClassSVM(kernel=kernel, nu=nu, verbose=True, max_mem_size=4048)
|
||||
|
||||
self.hybrid = hybrid
|
||||
self.ae_net = None # autoencoder network for the case of a hybrid model
|
||||
@@ -40,24 +46,31 @@ class OCSVM(object):
|
||||
}
|
||||
|
||||
def train(
|
||||
self, dataset: BaseADDataset, device: str = "cpu", n_jobs_dataloader: int = 0
|
||||
self,
|
||||
dataset: BaseADDataset,
|
||||
device: str = "cpu",
|
||||
n_jobs_dataloader: int = 0,
|
||||
k_fold_idx: int = None,
|
||||
batch_size: int = 32,
|
||||
):
|
||||
"""Trains the OC-SVM model on the training data."""
|
||||
logger = logging.getLogger()
|
||||
|
||||
# do not drop last batch for non-SGD optimization shallow_ssad
|
||||
train_loader = DataLoader(
|
||||
dataset=dataset.train_set,
|
||||
batch_size=128,
|
||||
shuffle=True,
|
||||
num_workers=n_jobs_dataloader,
|
||||
drop_last=False,
|
||||
)
|
||||
if k_fold_idx is not None:
|
||||
train_loader, _ = dataset.loaders_k_fold(
|
||||
fold_idx=k_fold_idx,
|
||||
batch_size=batch_size,
|
||||
num_workers=n_jobs_dataloader,
|
||||
)
|
||||
else:
|
||||
train_loader, _, _ = dataset.loaders(
|
||||
batch_size=batch_size, num_workers=n_jobs_dataloader
|
||||
)
|
||||
|
||||
# Get data from loader
|
||||
X = ()
|
||||
for data in train_loader:
|
||||
inputs, _, _, _ = data
|
||||
inputs, _, _, _, _ = data
|
||||
inputs = inputs.to(device)
|
||||
if self.hybrid:
|
||||
inputs = self.ae_net.encoder(
|
||||
@@ -77,14 +90,21 @@ class OCSVM(object):
|
||||
best_auc = 0.0
|
||||
|
||||
# Sample hold-out set from test set
|
||||
_, test_loader, _ = dataset.loaders(
|
||||
batch_size=128, num_workers=n_jobs_dataloader
|
||||
)
|
||||
if k_fold_idx is not None:
|
||||
_, test_loader = dataset.loaders_k_fold(
|
||||
fold_idx=k_fold_idx,
|
||||
batch_size=batch_size,
|
||||
num_workers=n_jobs_dataloader,
|
||||
)
|
||||
else:
|
||||
_, test_loader, _ = dataset.loaders(
|
||||
batch_size=batch_size, num_workers=n_jobs_dataloader
|
||||
)
|
||||
|
||||
X_test = ()
|
||||
labels = []
|
||||
for data in test_loader:
|
||||
inputs, label_batch, _, _ = data
|
||||
inputs, label_batch, _, _, _ = data
|
||||
inputs, label_batch = inputs.to(device), label_batch.to(device)
|
||||
if self.hybrid:
|
||||
inputs = self.ae_net.encoder(
|
||||
@@ -102,8 +122,9 @@ class OCSVM(object):
|
||||
np.sum(labels == 1),
|
||||
)
|
||||
n_val = int(0.1 * n_test)
|
||||
n_val_normal, n_val_outlier = int(n_val * (n_normal / n_test)), int(
|
||||
n_val * (n_outlier / n_test)
|
||||
n_val_normal, n_val_outlier = (
|
||||
int(n_val * (n_normal / n_test)),
|
||||
int(n_val * (n_outlier / n_test)),
|
||||
)
|
||||
perm = np.random.permutation(n_test)
|
||||
X_val = np.concatenate(
|
||||
@@ -116,9 +137,14 @@ class OCSVM(object):
|
||||
|
||||
i = 1
|
||||
for gamma in gammas:
|
||||
|
||||
# Model candidate
|
||||
model = OneClassSVM(kernel=self.kernel, nu=self.nu, gamma=gamma)
|
||||
model = OneClassSVM(
|
||||
kernel=self.kernel,
|
||||
nu=self.nu,
|
||||
gamma=gamma,
|
||||
verbose=True,
|
||||
max_mem_size=4048,
|
||||
)
|
||||
|
||||
# Train
|
||||
start_time = time.time()
|
||||
@@ -147,7 +173,9 @@ class OCSVM(object):
|
||||
|
||||
# If hybrid, also train a model with linear kernel
|
||||
if self.hybrid:
|
||||
self.linear_model = OneClassSVM(kernel="linear", nu=self.nu)
|
||||
self.linear_model = OneClassSVM(
|
||||
kernel="linear", nu=self.nu, max_mem_size=4048
|
||||
)
|
||||
start_time = time.time()
|
||||
self.linear_model.fit(X)
|
||||
train_time = time.time() - start_time
|
||||
@@ -160,14 +188,26 @@ class OCSVM(object):
|
||||
logger.info("Finished training.")
|
||||
|
||||
def test(
|
||||
self, dataset: BaseADDataset, device: str = "cpu", n_jobs_dataloader: int = 0
|
||||
self,
|
||||
dataset: BaseADDataset,
|
||||
device: str = "cpu",
|
||||
n_jobs_dataloader: int = 0,
|
||||
k_fold_idx: int = None,
|
||||
batch_size: int = 32,
|
||||
):
|
||||
"""Tests the OC-SVM model on the test data."""
|
||||
logger = logging.getLogger()
|
||||
|
||||
_, test_loader, _ = dataset.loaders(
|
||||
batch_size=128, num_workers=n_jobs_dataloader
|
||||
)
|
||||
if k_fold_idx is not None:
|
||||
_, test_loader = dataset.loaders_k_fold(
|
||||
fold_idx=k_fold_idx,
|
||||
batch_size=batch_size,
|
||||
num_workers=n_jobs_dataloader,
|
||||
)
|
||||
else:
|
||||
_, test_loader, _ = dataset.loaders(
|
||||
batch_size=batch_size, num_workers=n_jobs_dataloader
|
||||
)
|
||||
|
||||
# Get data from loader
|
||||
idx_label_score = []
|
||||
@@ -175,7 +215,7 @@ class OCSVM(object):
|
||||
idxs = []
|
||||
labels = []
|
||||
for data in test_loader:
|
||||
inputs, label_batch, _, idx = data
|
||||
inputs, label_batch, _, idx, _ = data
|
||||
inputs, label_batch, idx = (
|
||||
inputs.to(device),
|
||||
label_batch.to(device),
|
||||
@@ -212,6 +252,9 @@ class OCSVM(object):
|
||||
labels = np.array(labels)
|
||||
scores = np.array(scores)
|
||||
self.results["test_auc"] = roc_auc_score(labels, scores)
|
||||
self.results["test_roc"] = roc_curve(labels, scores)
|
||||
self.results["test_prc"] = precision_recall_curve(labels, scores)
|
||||
self.results["test_ap"] = average_precision_score(labels, scores)
|
||||
|
||||
# If hybrid, also test model with linear kernel
|
||||
if self.hybrid:
|
||||
@@ -268,7 +311,7 @@ class OCSVM(object):
|
||||
"""Load OC-SVM model from import_path."""
|
||||
pass
|
||||
|
||||
def save_results(self, export_json):
|
||||
"""Save results dict to a JSON-file."""
|
||||
with open(export_json, "w") as fp:
|
||||
json.dump(self.results, fp)
|
||||
def save_results(self, export_pkl):
|
||||
with open(export_pkl, "wb") as fp:
|
||||
# json.dump(self.results, fp)
|
||||
pickle.dump(self.results, fp)
|
||||
|
||||
Reference in New Issue
Block a user