...
 
Commits (7)
@@ -14,3 +14,9 @@ cythonModules/build/
 *.pyd
 *.html
+*.pkl
+chemometrics/Assignments.txt
+chemometrics/Data.txt
import numpy as np
import cv2
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from scipy import spatial
from itertools import combinations
from random import sample
import sys
sys.path.append("C://Users//xbrjos//Desktop//Python")
from gepard.analysis.particleContainer import ParticleContainer
from gepard.analysis import particleAndMeasurement as pm
from methods import SubsamplingMethod
def get_pca(data: np.ndarray, numComp: int = 2) -> np.ndarray:
    """
    Standardizes the data and returns the first numComp principal components.
    Note: the standardized matrix is transposed before the fit, so input columns become the PCA samples.
    """
    try:
        standardizedData = StandardScaler().fit_transform(data.copy())
    except ValueError:
        print('standardscaler failed on the given data:')
        print('datashape', data.shape)
        print('unique:', np.unique(data))
        raise
    pca = PCA(n_components=numComp)
    princComp: np.ndarray = pca.fit_transform(np.transpose(standardizedData))
    return princComp
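For orientation, a quick shape sanity check with synthetic data (assumed purely for illustration, not from the repository): the feature matrix enters with features as rows, and the scores come back with one row per particle.

# minimal sketch: 11 features x 50 particles in, 50 two-dimensional scores out
rng = np.random.default_rng(0)
featureMat = rng.normal(size=(11, 50))
scores = get_pca(featureMat)
assert scores.shape == (50, 2)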
def do_DBSCAN_clustering(data: np.ndarray, eps: float = 0.1, min_samples: int = 10) -> tuple:
    """
    Does DBSCAN clustering and finds noisy data
    :param data: The input array, shape (numPoints, 2)
    :param eps: Maximum distance between two samples for one to be considered a neighbor of the other
    :param min_samples: Number of samples in a neighborhood for a point to count as a core point
    :return: Cluster labels for each point in the dataset given to fit(). Noisy samples are given the label -1.
    """
    assert data.shape[1] == 2
    standardizedData = StandardScaler().fit_transform(data)
    db = DBSCAN(eps=eps, min_samples=min_samples).fit(standardizedData)
    return db.labels_, db.core_sample_indices_
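A usage sketch with assumed synthetic points: two tight, well-separated blobs should come back as two clusters, with any stragglers labelled -1.

# hypothetical smoke test, not part of the module
blobA = np.random.normal(0.0, 0.05, (100, 2))
blobB = np.random.normal(5.0, 0.05, (100, 2))
labels, coreIndices = do_DBSCAN_clustering(np.vstack((blobA, blobB)))
print(set(labels))  # expected: {0, 1}, possibly plus -1 for noise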
def get_n_points_closest_to_point(points: np.ndarray, n: int, refPoint: np.ndarray) -> list:
    """
    Returns a list with indices of n points that are closest to the indicated refPoint
    :param points: np.ndarray, cols: x, y, rows: individual points
    :param n: number of points to return
    :param refPoint: np.array([x, y]) of reference point
    :return: list of point indices
    """
    distancesToPoints: np.ndarray = np.linalg.norm(points - refPoint, axis=1)
    sortedIndices = np.argsort(distancesToPoints)
    return list(sortedIndices[:n])
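A worked example with values chosen for illustration:

pts = np.array([[0.0, 0.0], [1.0, 1.0], [5.0, 5.0], [0.5, 0.5]])
# distances to (0, 0): 0.0, 1.41, 7.07, 0.71 -> the two closest are indices 0 and 3
assert get_n_points_closest_to_point(pts, 2, np.array([0.0, 0.0])) == [0, 3]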
class ChemometricSubsampling(SubsamplingMethod):
    def __init__(self, particleContainer: ParticleContainer, desiredFraction: float):
        super(ChemometricSubsampling, self).__init__(particleContainer, desiredFraction)

    @property
    def label(self) -> str:
        return 'Chemometric Selection'

    def apply_subsampling_method(self) -> list:
        vectors: np.ndarray = self._get_particle_featurematrix()
        try:
            princComps: np.ndarray = get_pca(vectors)
        except ValueError:
            print('numParticles:', len(self.particleContainer.particles))
            print('input featurematrix shape', vectors.shape)
            raise  # without re-raising, princComps would be undefined below

        clusterLabels, coreIndices = do_DBSCAN_clustering(princComps)
        indices: list = self._get_indices_from_clusterLabels(princComps, clusterLabels, coreIndices)

        selectedParticles: list = []
        for particle in self.particleContainer.particles:
            if particle.index in indices:
                selectedParticles.append(particle)

        return selectedParticles

    def _get_particle_featurematrix(self) -> np.ndarray:
        """
        :return: np.ndarray, numRows: Features, numCols: Particles
        """
        vectors: list = []
        for particle in self.particleContainer.particles:
            extractor: FeatureExtractor = FeatureExtractor(particle)
            vectors.append(extractor.get_characteristic_vector())

        vectors: np.ndarray = np.transpose(np.array(vectors))
        assert vectors.shape == (11, len(self.particleContainer.particles)), f'wrong featureMat-shape: {vectors.shape}'
        return vectors

    def equals(self, otherMethod) -> bool:
        equals: bool = False
        if type(otherMethod) == ChemometricSubsampling and otherMethod.fraction == self.fraction:
            equals = True
        return equals
    def _get_indices_from_clusterLabels(self, points: np.ndarray, labels: np.ndarray, centerIndices: np.ndarray) -> list:
        indices: list = []
        allIndices: np.ndarray = np.arange(len(labels))
        numPointsPerCluster: dict = self._get_numPoints_per_cluster(labels)

        for clusterIndex in set(labels):
            nPoints: int = int(numPointsPerCluster[clusterIndex])
            indicesInCluster: np.ndarray = allIndices[labels == clusterIndex]
            if clusterIndex == -1:
                # noise points: draw randomly
                for ind in sample(list(indicesInCluster), nPoints):
                    indices.append(ind)
            else:
                # clustered points: take the ones closest to the cluster center
                clusterPoints: np.ndarray = points[indicesInCluster]
                centerPoint: np.ndarray = np.mean(clusterPoints, axis=0)
                indicesToSelect: list = get_n_points_closest_to_point(clusterPoints, nPoints, centerPoint)
                for ind in indicesToSelect:
                    origInd = indicesInCluster[ind]
                    indices.append(origInd)

        assert len(set(indices)) == len(indices), f'The calculated indices contain duplicates, ' \
                                                  f'num duplicates: {len(indices) - len(set(indices))}'
        return indices
    def _get_numPoints_per_cluster(self, labels: np.ndarray, noiseAmpFactor: float = 5) -> dict:
        """
        MP particles are expected to be the minority of all particles. So, if datapoints were classified as noise
        (i.e., label = -1), it is likely that MP is in there. The abundance of points taken from the noise is
        therefore multiplied by the noiseAmpFactor.
        :param labels: array of cluster labels, one per data point (-1 = noise)
        :param noiseAmpFactor: amplification factor for noise points, capped at 1/fraction
        :return: A dictionary with keys = cluster index (i.e., label) and value = number of points to take from that cluster
        """
        pointsPerCluster: dict = {}
        if type(labels) != np.ndarray:
            labels = np.array(labels)

        individualLabels: set = set(labels)
        numPointsToSelect = round(len(labels) * self.fraction)
        if numPointsToSelect == 0:
            numPointsToSelect = 1

        numNoisePoints = len(labels[labels == -1])
        numClusteredPoints = len(labels) - numNoisePoints

        # cap the noiseAmpFactor, otherwise more points than available could be requested
        if noiseAmpFactor > 1/self.fraction:
            noiseAmpFactor = 1/self.fraction

        numAmpPoints = numClusteredPoints + numNoisePoints*noiseAmpFactor
        fractionPerCluster = np.clip(numPointsToSelect / numAmpPoints, 0.0, 1.0)

        tooFewPoints = numPointsToSelect < len(individualLabels)

        totalPointsAdded = 0
        for ind in individualLabels:
            if ind > -1:
                if not tooFewPoints:
                    pointsToAdd = round(fractionPerCluster * len(labels[labels == ind]))
                else:
                    pointsToAdd = 1 if totalPointsAdded < numPointsToSelect else 0

                pointsPerCluster[ind] = pointsToAdd
                totalPointsAdded += pointsToAdd

        # fill up the rest with noise points
        if numNoisePoints > 0:
            diff: float = np.clip(numPointsToSelect - totalPointsAdded, 0, numNoisePoints)
            pointsPerCluster[-1] = diff
            totalPointsAdded += diff

        # just in case too many points were selected (due to rounding errors), keep on deleting until it matches
        while totalPointsAdded > numPointsToSelect:
            indexWithHighestCount = None
            maxCount = 0
            for index in pointsPerCluster.keys():  # was .values(), which broke the lookup below
                if pointsPerCluster[index] > maxCount:
                    maxCount = pointsPerCluster[index]
                    indexWithHighestCount = index

            pointsPerCluster[indexWithHighestCount] -= 1
            totalPointsAdded -= 1

        assert abs(totalPointsAdded - numPointsToSelect) <= 1
        for clusterIndex in pointsPerCluster.keys():
            assert 0 <= pointsPerCluster[clusterIndex] <= len(labels[labels == clusterIndex])
        return pointsPerCluster
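To make the allocation concrete, a short walk-through with assumed numbers (not from any dataset):

# 90 clustered points (60 in cluster 0, 30 in cluster 1) plus 10 noise points, fraction = 0.1
labels = np.array([0] * 60 + [1] * 30 + [-1] * 10)
# numPointsToSelect = round(100 * 0.1) = 10
# amplified pool = 90 + 10 * 5 = 140, so fractionPerCluster = 10 / 140
# cluster 0: round(10/140 * 60) = 4, cluster 1: round(10/140 * 30) = 2
# noise fills the remainder: pointsPerCluster == {0: 4, 1: 2, -1: 4}
# i.e., 40 % of the selection comes from the 10 % of points labelled as noise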
class FeatureExtractor(object):
    def __init__(self, particle: pm.Particle):
        super(FeatureExtractor, self).__init__()
        self.particle: pm.Particle = particle

    def get_characteristic_vector(self) -> np.ndarray:
        log_hu: np.ndarray = self._get_log_hu_moments()
        color: np.ndarray = self._get_color_hash(self.particle.color, desiredLength=4)
        vector: np.ndarray = np.hstack((log_hu, color))
        assert len(vector) == 7 + 4, f'wrong feature vector: {vector} with shape: {vector.shape}'
        return vector

    def _get_log_hu_moments(self) -> np.ndarray:
        moments: dict = cv2.moments(self.particle.contour)
        resultMoments: np.ndarray = np.zeros((7, 1))
        for index, mom in enumerate(cv2.HuMoments(moments)):
            if mom != 0:
                # log-transform to bring the Hu moments onto comparable scales
                resultMoments[index] = -1 * np.copysign(1.0, mom) * np.log10(abs(mom))
            else:
                resultMoments[index] = 0
        return resultMoments[:, 0]

    def _get_color_hash(self, color: str, desiredLength: int = 4) -> np.ndarray:
        # caution: built-in hash() of strings is salted per interpreter session (PYTHONHASHSEED),
        # so these features are not reproducible across runs
        colorArray: list = [int(i) for i in str(abs(hash(color)))[:desiredLength]]
        return np.transpose(np.array(colorArray))
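Because built-in hash() is salted per interpreter session, the color features above change between runs. If reproducibility matters, a stable digest could be substituted; a minimal hashlib-based sketch (not part of the current code):

import hashlib

def stable_color_hash(color: str, desiredLength: int = 4) -> np.ndarray:
    # md5 of the color name is identical in every session, unlike hash()
    digest = hashlib.md5(color.encode('utf-8')).hexdigest()
    return np.array([int(ch, 16) % 10 for ch in digest[:desiredLength]])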
This diff is collapsed.
import matplotlib.pyplot as plt
import numpy as np
from random import sample
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import pickle
import time
import sys
sys.path.append("C://Users//xbrjos//Desktop//Python")
from gepard import dataset
from gepard.analysis.particleContainer import ParticleContainer
from input_output import get_pkls_from_directory
from chemometricMethods import get_log_hu_moments, get_color_index, get_pca, get_characteristic_vector
from evaluation import is_MP_particle
def test_classification_models(dataset: tuple) -> None:
    names = ["RandomForestClassifier", "NeuralNetClassifier"]
    classifiers = [
        RandomForestClassifier(n_estimators=1000),
        MLPClassifier(alpha=1, max_iter=1000)]

    t0 = time.time()
    # preprocess dataset, split into training and test part
    X, y = dataset
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.3, random_state=42)
    print(f'preprocessing finished after {round(time.time()-t0, 2)} seconds')

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        t0 = time.time()
        clf.fit(X_train, y_train)
        print(f'fitting {name} took {round(time.time()-t0, 2)} seconds')

        t0 = time.time()
        score = clf.score(X_test, y_test)
        with open(f'{name}, score {round(score, 2)}.pkl', "wb") as fp:
            pickle.dump(clf, fp, protocol=-1)
        y_predicted = clf.predict(X_test)
        print(f'finished getting score and prediction after {round(time.time() - t0, 2)} seconds')

        errors: dict = {int(k): 0 for k in np.unique(y_test)}
        for j in range(len(y_predicted)):
            if y_test[j] != y_predicted[j]:
                errors[y_test[j]] += 1

        print(f'{name} with test size {len(y_test)} has score {round(score, 2)}, errors: {errors}')
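For a quick end-to-end check without any GEPARD datasets, the function can be fed synthetic data; a demo sketch using sklearn's make_classification (assumed purely for illustration):

from sklearn.datasets import make_classification

# 11 features to match the characteristic vector; labels are synthetic
X_demo, y_demo = make_classification(n_samples=500, n_features=11, random_state=42)
test_classification_models((X_demo, y_demo))  # note: writes '<name>, score <x>.pkl' files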
if __name__ == '__main__':
    recreateNew: bool = False
    if recreateNew:
        pklsInFolders: dict = get_pkls_from_directory(r'C:\Users\xbrjos\Desktop\temp MP\NewDatasets')

        X: list = []
        y: list = []
        counter = 0
        for folder in pklsInFolders.keys():
            for pklPath in pklsInFolders[folder]:
                if counter < 100:
                    dset: dataset.DataSet = dataset.loadData(pklPath)
                    print('loaded', dset.name)
                    partContainer: ParticleContainer = dset.particleContainer
                    for particle in partContainer.particles:
                        features: np.ndarray = get_characteristic_vector(particle)
                        # features: list = [abs(i) for i in get_log_hu_moments(particle.contour)]
                        # features.append(get_color_index(particle.color))
                        X.append(features)
                        y.append(int(is_MP_particle(particle)))
                    counter += 1

        X: np.ndarray = np.array(X)
        y: np.ndarray = np.array(y)

        MPindices: np.ndarray = np.where(y == 1)[0]
        nonMPindices: np.ndarray = np.where(y == 0)[0]
        nonMPindices: list = sample(list(nonMPindices), len(MPindices))  # undersample the majority class

        X_MP: list = list(X[MPindices])
        y_MP: list = list(y[MPindices])

        X_nonMP: list = list(X[nonMPindices])
        y_nonMP: list = list(y[nonMPindices])

        assert set(y_MP) == {1}
        assert set(y_nonMP) == {0}
        assert len(X_MP) == len(X_nonMP) == len(y_MP) == len(y_nonMP)

        X_equalized: np.ndarray = np.array(X_MP + X_nonMP)
        y_equalized: np.ndarray = np.array(y_MP + y_nonMP)

        dset: tuple = (X_equalized, y_equalized)

        with open('particleClassificaion.pkl', "wb") as fp:
            pickle.dump(dset, fp, protocol=-1)
    else:
        with open('particleClassificaion.pkl', "rb") as fp:
            dset: tuple = pickle.load(fp)
        X, y = dset

    with open(r'C:\Users\xbrjos\Desktop\Python\Subsampling\chemometrics\RandomForestClassifier, score 0.72.pkl', "rb") as fp:
        clf: RandomForestClassifier = pickle.load(fp)

    y_predicted = clf.predict(X)

    # np.savetxt('Data.txt', X)
    # np.savetxt('Assignments.txt', y)

    # princComps = get_pca(X.transpose(), numComp=2)
    #
    # plt.scatter(princComps[:, 0], princComps[:, 1])
    # print(X_equalized.shape)
    # X: np.ndarray = SelectKBest(chi2, k=5).fit_transform(X, y)
    # print(X_equalized.shape)
    # test_classification_models((X, y))
@@ -3,33 +3,35 @@ cimport numpy as np
 cimport numpy.random
 cimport cython
+from libc.math cimport sin, cos, round, abs
+from libc.stdlib cimport rand, srand, RAND_MAX
 
 DTYPE = np.float
 ctypedef np.int32_t INT32_t
 
 cdef get_random_topleft(double maxDist, double maxAngle, double radius, double boxSize):
     cdef double angle, dist, x, y
-    cdef np.ndarray[INT32_t, ndim=1] newTopLeft
-    dist = np.random.rand() * maxDist
-    angle = np.random.rand() * maxAngle
-    newTopLeft = np.empty(2, dtype=np.int32)
-    x = dist*np.cos(angle) + radius - boxSize/2
-    y = dist*np.sin(angle) + radius - boxSize/2
-    newTopLeft[0] = np.int32(np.round(x))
-    newTopLeft[1] = np.int32(np.round(y))
+    cdef int newTopLeft[2]
+    dist = rand() / 32767.0 * maxDist
+    angle = rand() / 32767.0 * maxAngle
+    x = dist*cos(angle) + radius - boxSize/2
+    y = dist*sin(angle) + radius - boxSize/2
+    newTopLeft[0] = int(round(x))
+    newTopLeft[1] = int(round(y))
     return newTopLeft
 
 
 def get_random_topLefts(int numBoxes, double boxSize, double radius, double maxAngle, int seed=1337, int maxTries=50):
     cdef np.ndarray[INT32_t, ndim=2] topLefts
-    cdef np.ndarray[INT32_t, ndim=1] newTopLeft
+    cdef int newTopLeft[2]
     cdef double maxDist
     cdef int outerCounter, counter, x, y, i, j, diffX, diffY, successfullyAdded
     cdef bint validSolutionFound, boxOverlaps
-    np.random.seed(seed)
+    srand(seed)  # setting seed
+    assert RAND_MAX == 32767  # this value is used in the random-function above. For performance-reasons, it is directly typed in there as a number
     maxDist = radius - np.sqrt((boxSize/2)**2 + (boxSize/2)**2)
     outerCounter = 0
     validSolutionFound = False
@@ -47,8 +49,8 @@ def get_random_topLefts(int numBoxes, double boxSize, double radius, double maxA
             newTopLeft = get_random_topleft(maxDist, maxAngle, radius, boxSize)
             boxOverlaps = False
             for j in range(i):
-                diffX = abs(np.float(newTopLeft[0] - np.float(topLefts[j, 0])))
-                diffY = abs(np.float(newTopLeft[1] - np.float(topLefts[j, 1])))
+                diffX = abs(newTopLeft[0] - topLefts[j, 0])
+                diffY = abs(newTopLeft[1] - topLefts[j, 1])
 
                 if diffX < boxSize and diffY < boxSize:
                     boxOverlaps = True
...
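For readers without the compiled module, an equivalent pure-Python sketch of the sampling geometry (assumed equivalent up to the RNG: dist and angle are polar coordinates around the filter center, shifted so the returned value is a box top-left corner):

import numpy as np

def random_topleft(maxDist: float, maxAngle: float, radius: float, boxSize: float) -> tuple:
    # same geometry as the cdef function above, with numpy's RNG instead of C rand()
    dist = np.random.rand() * maxDist
    angle = np.random.rand() * maxAngle
    x = dist * np.cos(angle) + radius - boxSize / 2
    y = dist * np.sin(angle) + radius - boxSize / 2
    return int(round(x)), int(round(y))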
@@ -18,7 +18,7 @@ from gepard.analysis.particleAndMeasurement import Particle
 from helpers import ParticleBinSorter
 import methods as meth
 import geometricMethods as gmeth
-import chemometricMethods as cmeth
+from chemometrics import chemometricMethods as cmeth
 from datasetOperations import ParticleVariations
@@ -26,14 +26,14 @@ def get_name_from_directory(dirPath: str) -> str:
     return str(os.path.basename(dirPath).split('.')[0])
 
 
-def get_methods_to_test(dataset: dataset.DataSet, fractions: list = []) -> list:
+def get_methods_to_test(dataset: dataset.DataSet, fractions: list = [], maxTries: int = 100) -> list:
     """
     :param fraction: The desired fraction to measure
     :return: list of measurement Objects that are applicable
     """
     if len(fractions) == 0:
-        fractions: list = [0.02, 0.05, 0.1, 0.25, 0.5, 0.7, 0.9]
-        # fractions: list = [0.02, 0.1, 0.5, 0.9]
+        # fractions: list = [0.02, 0.05, 0.1, 0.25, 0.5, 0.7, 0.9]
+        fractions: list = [0.1, 0.3, 0.5]
 
     methods: list = []
     particleContainer = dataset.particleContainer
@@ -44,8 +44,9 @@ def get_methods_to_test(dataset: dataset.DataSet, fractions: list = []) -> list:
         boxCreator: gmeth.BoxSelectionCreator = gmeth.BoxSelectionCreator(dataset)
         methods += boxCreator.get_crossBoxSubsamplers_for_fraction(fraction)
         methods += boxCreator.get_spiralBoxSubsamplers_for_fraction(fraction)
-        methods += boxCreator.get_randomBoxSubsamplers_for_fraction(fraction)
-        methods += boxCreator.get_randomQuarterBoxSubsamplers_for_fraction(fraction)
+        methods += boxCreator.get_randomBoxSubsamplers_for_fraction(fraction, maxTries=maxTries)
+        methods += boxCreator.get_randomQuarterBoxSubsamplers_for_fraction(fraction, maxTries=maxTries)
+        methods.append(cmeth.TrainedSubsampling(particleContainer, fraction))
         # methods.append(cmeth.ChemometricSubsampling(particleContainer, fraction))
     return methods
@@ -91,27 +92,34 @@ class TotalResults(object):
         return newResult
 
-    def update_all(self, force: bool = False) -> None:
+    def update_all(self, force: bool = False, multiprocessing: bool = True) -> None:
         """
         Updates all samples with all methods and all fractions
-        :param force: Wether to force an update of an already existing method.
+        :param force: Whether to force an update of an already existing method.
+        :param multiprocessing: Whether to spawn multiple processes for computation
         :return:
         """
-        forceList: list = [force]*len(self.sampleResults)
-        indices: list = list(np.arange(len(self.sampleResults)))
-        numSamples: int = len(forceList)
-        numWorkers: int = 4  # in case of quadcore processor that seems reasonable??
-        chunksize: int = int(round(numSamples / numWorkers * 0.7))  # we want to have slightly more chunks than workers
-        print(f'multiprocessing with {numSamples} samples and chunksize of {chunksize}')
-
-        with concurrent.futures.ProcessPoolExecutor() as executor:
-            results = executor.map(update_sample, self.sampleResults, forceList, indices, chunksize=chunksize)
-
-        for index, res in enumerate(results):
-            updatedSample, processid = res
-            print(f'returned from process {processid}, iteration index {index}')
-            self.sampleResults[index] = updatedSample
+        if multiprocessing:
+            forceList: list = [force]*len(self.sampleResults)
+            indices: list = list(np.arange(len(self.sampleResults)))
+            numSamples: int = len(forceList)
+            numWorkers: int = 4  # in case of quadcore processor that seems reasonable??
+            chunksize: int = int(round(numSamples / numWorkers * 0.7))  # we want to have slightly more chunks than workers
+            print(f'multiprocessing with {numSamples} samples and chunksize of {chunksize}')
+
+            with concurrent.futures.ProcessPoolExecutor() as executor:
+                results = executor.map(update_sample, self.sampleResults, forceList, indices, chunksize=chunksize)
+
+            for index, res in enumerate(results):
+                updatedSample, processid = res
+                print(f'returned from process {processid}, iteration index {index}')
+                self.sampleResults[index] = updatedSample
+        else:
+            for index, sampleResult in enumerate(self.sampleResults):
+                updatedResult, i = update_sample(sampleResult, True, index)
+                self.sampleResults[index] = updatedResult
+                print(f'done updating {updatedResult.dataset.name} at index {index}')
 
     def get_error_vs_fraction_data(self, attributes: list = [], methods: list = []) -> dict:
         """
@@ -200,7 +208,12 @@ class SubsamplingResult(object):
         :param subParticles:
         :return:
         """
-        error: float = self._get_mp_count_error(origParticles, subParticles, self.method.fraction)
+        if type(self.method) == cmeth.TrainedSubsampling:
+            fraction = self.method.get_theoretic_frac()
+        else:
+            fraction = self.method.fraction
+
+        error: float = self._get_mp_count_error(origParticles, subParticles, fraction)
         self.origParticleCount = len(origParticles)
         self.mpCountErrors.append(error)
...
@@ -169,11 +169,12 @@ class BoxSelectionCreator(object):
 
         return spiralBoxSubsamplers
 
-    def get_randomBoxSubsamplers_for_fraction(self, desiredFraction: float) -> list:
+    def get_randomBoxSubsamplers_for_fraction(self, desiredFraction: float, maxTries: int = 100) -> list:
         randomBoxSamplers: list = []
        diameter, offset = self._get_diameter_and_offset()
 
         randomBoxSampler: RandomBoxSampling = RandomBoxSampling(None, desiredFraction)
+        randomBoxSampler.maxTries = maxTries
         randomBoxSampler.update_max_fractions()
         for numBoxes in randomBoxSampler.possibleBoxNumbers:
             randomBoxSampler.numBoxes = numBoxes
@@ -186,10 +187,11 @@ class BoxSelectionCreator(object):
 
         return randomBoxSamplers
 
-    def get_randomQuarterBoxSubsamplers_for_fraction(self, desiredFraction: float) -> list:
+    def get_randomQuarterBoxSubsamplers_for_fraction(self, desiredFraction: float, maxTries: int = 100) -> list:
         randomBoxSamplers: list = []
         diameter, offset = self._get_diameter_and_offset()
 
         randomBoxSampler: RandomQuarterBoxes = RandomQuarterBoxes(None, desiredFraction)
+        randomBoxSampler.maxTries = maxTries
         randomBoxSampler.update_max_fractions()
 
         for numBoxes in randomBoxSampler.possibleBoxNumbers:
@@ -394,57 +396,14 @@ class RandomBoxSampling(BoxSelectionSubsamplingMethod):
 
         return equals
 
     def get_topLeft_of_boxes(self) -> list:
-        #
-        # valid, topLefts = randoms.get_random_topLefts(self.numBoxes, self.boxSize,
-        #                                               self.filterDiameter/2, self.__maxAngle,
-        #                                               seed=self.randomSeed, maxTries=self.maxTries)
-        #
-        # if not valid:
-        #     raise AttributeError
-        #
-        # topLefts: list = [[topLefts[i, 0], topLefts[i, 1]] for i in range(topLefts.shape[0])]
-        #
-        def get_random_topleft() -> list:
-            angle = np.random.rand() * self.__maxAngle
-            dist = np.random.rand() * maxDist
-            x: float = dist * np.cos(angle) + radius - boxSize / 2
-            y: float = dist * np.sin(angle) + radius - boxSize / 2
-            return [x, y]
-
-        np.random.seed(self.randomSeed)
-        topLefts: list = []
-        boxSize: float = self.boxSize
-        radius: float = self.filterDiameter / 2
-        maxDist: float = radius - np.sqrt((boxSize / 2) ** 2 + (boxSize / 2) ** 2)
-        outerCounter: int = 0
-        validSolutionFound: bool = False
-        while not validSolutionFound and outerCounter < self.maxTries:
-            topLefts = []
-            for i in range(self.numBoxes):
-                if i == 0:
-                    topLefts.append(get_random_topleft())
-                else:
-                    counter: int = 0
-                    while counter < 50:
-                        newTopLeft: list = get_random_topleft()
-                        for topLeft2 in topLefts:
-                            if box_overlaps_other_box(newTopLeft, topLeft2, boxSize):
-                                break
-                        else:  # i.e., if no break occurred
-                            topLefts.append(newTopLeft)
-                            break
-                        counter += 1
-
-            if len(topLefts) == self.numBoxes:
-                validSolutionFound = True
-            else:
-                outerCounter += 1
-
-        if not validSolutionFound:
+        valid, topLefts = randoms.get_random_topLefts(self.numBoxes, self.boxSize,
+                                                      self.filterDiameter/2, self.__maxAngle,
+                                                      seed=self.randomSeed, maxTries=self.maxTries)
+
+        if not valid:
             raise AttributeError
+
+        topLefts: list = [[topLefts[i, 0], topLefts[i, 1]] for i in range(topLefts.shape[0])]
 
         return topLefts
...
@@ -2,7 +2,7 @@ import os
 import pickle
 
 from evaluation import TotalResults
 from helpers import timingDecorator
+from chemometrics.chemometricMethods import TrainedSubsampling
 
 
 def load_results(fname: str) -> TotalResults:
     res: TotalResults = None
@@ -17,6 +17,10 @@ def save_results(fname: str, result: TotalResults) -> None:
     for sampleRes in result.sampleResults:
         storedDsets[sampleRes.sampleName] = sampleRes.dataset
         sampleRes.dataset = None
+        for subsamplingRes in sampleRes.results:
+            subsamplingRes.method.particleContainer = None
+            if type(subsamplingRes.method) == TrainedSubsampling:
+                subsamplingRes.method.clf = None
     with open(fname, "wb") as fp:
         pickle.dump(result, fp, protocol=-1)
...
@@ -18,6 +18,15 @@ class SubsamplingMethod(object):
         self.particleContainer = particleConatainer
         self.fraction: float = desiredFraction
 
+    # @property
+    # def fraction(self) -> float:
+    #     """
+    #     The TrainedSubsampling, e.g., changes its fraction depending on the quality of its training.
+    #     All "regular" methods just return the desired Fraction.
+    #     :return:
+    #     """
+    #     return self.desiredFraction
+
     @property
     def label(self) -> str:
         """
...
@@ -14,25 +14,27 @@ SET GEPARD TO EVALUATION BRANCH (WITHOUT THE TILING STUFF), OTHERWISE SOME OF TH
 if __name__ == '__main__':
     results: TotalResults = TotalResults()
     pklsInFolders = get_pkls_from_directory(r'C:\Users\xbrjos\Desktop\temp MP\NewDatasets')
 
+    counter = 0
     for folder in pklsInFolders.keys():
         for samplePath in pklsInFolders[folder]:
-            newSampleResult: SampleResult = results.add_sample(samplePath)
-            for attr in get_attributes_from_foldername(folder):
-                newSampleResult.set_attribute(attr)
+            if counter < 10:
+                newSampleResult: SampleResult = results.add_sample(samplePath)
+                for attr in get_attributes_from_foldername(folder):
+                    newSampleResult.set_attribute(attr)
+            counter += 1
 
     t0 = time.time()
-    results.update_all()
+    results.update_all(multiprocessing=False)
     print('updating all took', time.time()-t0, 'seconds')
 
-    save_results('results2.res', results)
-    # results: TotalResults = load_results('results1.res')
+    save_results('results_test.res', results)
+    # results: TotalResults = load_results('results2.res')
 
-    plot: Figure = get_error_vs_frac_plot(results,
-                                          attributes=[['air', 'water'], ['sediment', 'soil', 'beach', 'slush']],
-                                          methods=[['layout (7', 'layout (10', 'layout (15', 'cross', 'random subsampling', 'sizebin']] * 2)
+    plot: Figure = get_error_vs_frac_plot(results, attributes=[[]],
+                                          methods=[['random', 'trained']], standarddevs=True)
     # plot: Figure = get_error_vs_frac_plot(results, attributes=[['air', 'water'], ['sediment', 'soil', 'beach', 'slush']],
-    #                                       methods=[['Boxes random']]*2)
+    #                                       methods=[['random layout (7', 'random layout (1']]*2)
+    #                                       methods=[[]]*2)
     #                                       methods=[['Random Subsampling', 'Sizebin']] * 2)
+    #                                       methods=[['layout (7', 'layout (10', 'layout (15', 'cross', 'random subsampling', 'sizebin']] * 2)
     plot.show()
 import numpy as np
+import random
 import sys
 sys.path.append("C://Users//xbrjos//Desktop//Python")
 import gepard
 from gepard.dataset import DataSet
 from gepard.analysis.particleContainer import ParticleContainer
+from gepard.analysis.particleAndMeasurement import Particle, Measurement
 
 
 def setMaxDim(dataset: DataSet, imgSize: float, minX: float, maxX: float, minY: float, maxY: float) -> None:
@@ -27,5 +29,46 @@ def get_default_ParticleContainer() -> ParticleContainer:
         x = 10*i
         contours.append(np.array([[[x, 0]], [[x+10, 0]], [[x+10, 10]], [[x, 10]]], dtype=np.int32))
     particleContainer.setParticleContours(contours)
+    particleContainer.particles[0].color = 'red'
+    particleContainer.particles[1].color = 'blue'
+    particleContainer.particles[2].color = 'green'
+    particleContainer.particles[3].color = 'transparent'
     return particleContainer
+
+
+def get_MP_particles(numParticles) -> list:
+    mpParticles = []
+    for _ in range(numParticles):
+        mpParticles.append(get_MP_particle())
+    return mpParticles
+
+
+def get_non_MP_particles(numParticles) -> list:
+    nonMPParticles = []
+    for _ in range(numParticles):
+        nonMPParticles.append(get_non_MP_particle())
+    return nonMPParticles
+
+
+def get_MP_particle() -> Particle:
+    random.seed(15203018)
+    polymerNames = ['Poly (methyl methacrylate',
+                    'Polyethylene',
+                    'Silicone rubber',
+                    'PB15',
+                    'PY13',
+                    'PR20']
+    polymName = random.sample(polymerNames, 1)[0]
+    newParticle: Particle = Particle()
+    newMeas = Measurement()
+    newMeas.setAssignment(polymName)
+    newParticle.addMeasurement(newMeas)
+    return newParticle
+
+
+def get_non_MP_particle() -> Particle:
+    newParticle: Particle = Particle()
+    newParticle.addMeasurement(Measurement())
+    return newParticle
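With these helpers, a class-imbalanced toy population for the subsampling tests can be built in one line; a hedged usage sketch with assumed counts:

# e.g., a 5 % MP share, mirroring the imbalance expected in real samples
particles = get_MP_particles(50) + get_non_MP_particles(950)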
This diff is collapsed.
This diff is collapsed.
@@ -320,6 +320,7 @@ class TestBoxCreator(unittest.TestCase):
 
     def test_get_randBoxSubsampler_for_Fraction(self):
         randBoxSampler: RandomBoxSampling = RandomBoxSampling(None)
+        randBoxSampler.maxTries = 10
         randBoxSampler.update_max_fractions()
         maxFracs: dict = randBoxSampler.maxFractions
@@ -331,7 +332,7 @@ class TestBoxCreator(unittest.TestCase):
                 numValid += 1
                 validNumBoxes.append(numBoxes)
 
-            possibleMehotds: list = self.boxCreator.get_randomBoxSubsamplers_for_fraction(frac)
+            possibleMehotds: list = self.boxCreator.get_randomBoxSubsamplers_for_fraction(frac, maxTries=10)
             self.assertEqual(len(possibleMehotds), numValid)
             self._assert_correct_partCont(possibleMehotds)
             for meth in possibleMehotds:
...
@@ -15,7 +15,7 @@ from gepard.analysis.particleContainer import ParticleContainer
 from gepard.analysis.particleAndMeasurement import Particle
 from methods import SubsamplingMethod, RandomSampling, SizeBinFractioning
 import geometricMethods as gmeth
-import chemometricMethods as cmeth
+from chemometrics import chemometricMethods as cmeth
 from helpers import ParticleBinSorter
@@ -28,12 +28,13 @@ def get_default_particle_container(numParticles=1000):
 
 class TestAllMethodsGeneric(unittest.TestCase):
     allMethodClasses: list = [RandomSampling, SizeBinFractioning,
                               gmeth.CrossBoxSubSampling, gmeth.SpiralBoxSubsampling,
-                              cmeth.ChemometricSubsampling]
+                              cmeth.TrainedSubsampling]
 
     def setUp(self) -> None:
+        partContainer: ParticleContainer = get_default_particle_container()
         self.methods = []
         for methClass in self.allMethodClasses:
-            self.methods.append(methClass(None, 0.1))
+            self.methods.append(methClass(partContainer, 0.1))
 
     def test_basic_methods(self):
         for method in self.methods:
...