Commit ee19bc2c authored by Josef Brandt's avatar Josef Brandt

Kennard-Stone optimization (may have introduced a bug; needs verification)

parent 5d22e3c7
......@@ -24,16 +24,18 @@ class ChemometricSubsampling(SubsamplingMethod):
def __init__(self, particleContainer: ParticleContainer, desiredFraction: float):
    """Create the chemometric subsampler.

    Both arguments are handed unchanged to the parent-class initializer.

    :param particleContainer: container whose particles are subsampled
    :param desiredFraction: fraction of particles to select
        (presumably within (0, 1] -- confirm against SubsamplingMethod)
    """
    super().__init__(particleContainer, desiredFraction)
@property
def label(self) -> str:
    """Human-readable name of this subsampling method."""
    methodName: str = 'Chemometric Selection'
    return methodName
def apply_subsampling_method(self) -> list:
    """Select particles by running Kennard-Stone on the PCA of the feature matrix.

    The particle feature matrix is reduced with ``get_pca`` and the
    resulting components are passed to ``KennardStone``; every particle
    whose ``index`` is among the sampled indices is returned.

    :return: list of the selected particle objects, in container order
    """
    vectors: np.ndarray = self._get_particle_featurematrix()
    princComps: np.ndarray = get_pca(vectors)
    kennardStone: KennardStone = KennardStone(princComps, self.fraction)

    # A set gives O(1) membership tests instead of scanning the index list
    # once per particle.
    sampledIndices: set = set(kennardStone.get_sampled_indices())

    # NOTE(review): this assumes particle.index equals the particle's row
    # position in the feature matrix -- confirm against
    # _get_particle_featurematrix.
    return [particle for particle in self.particleContainer.particles
            if particle.index in sampledIndices]
......@@ -46,6 +48,12 @@ class ChemometricSubsampling(SubsamplingMethod):
vectors: np.ndarray = np.transpose(np.array(vectors))
return vectors
def equals(self, otherMethod) -> bool:
equals: bool = False
if type(otherMethod) == ChemometricSubsampling and otherMethod.fraction == self.fraction:
equals = True
return equals
class FeatureExtractor(object):
def __init__(self, particle: pm.Particle):
......@@ -78,9 +86,10 @@ class KennardStone(object):
Adapted from https://github.com/karoka/Kennard-Stone-Algorithm/blob/master/kenStone.py
:return"""
t0 = time.time()
numIndices: int = round(self.data.shape[0] * self.fraction)
if numIndices < 2:
raise ValueError(f'Not enough indices to generate (min = 2), requested: {numIndices}')
assert self.data.shape[1] == 2
numIndices: int = int(np.ceil(self.data.shape[0] * self.fraction))
if numIndices == 1:
selectedIndices = [int(round(self.data.shape[0]/2))]
else:
distMat = spatial.distance_matrix(self.data, self.data)
......@@ -88,7 +97,7 @@ class KennardStone(object):
selectedIndices = find_furthest_indices(distMat, int(numIndices), i, j)
assert len(np.unique(list(selectedIndices))) == len(selectedIndices)
print('selecting indices time:', np.round(time.time() - t0, 2), 'seconds')
print(f'selecting indices time: {np.round(time.time() - t0, 2)} seconds ({numIndices} out of {self.data.shape[0]})')
return list(selectedIndices)
def _get_start_indices(self) -> list:
......
......@@ -9,20 +9,25 @@ ctypedef np.int64_t INT64_t
def find_furthest_indices(np.ndarray[DTYPE_t, ndim=2] distMat, int numIndices, int index0, int index1):
    """Kennard-Stone selection on a precomputed distance matrix.

    Starting from the two seed points ``index0`` and ``index1``, repeatedly
    adds the not-yet-selected point whose smallest distance to any already
    selected point is largest, until ``numIndices`` points are selected.

    :param distMat: square matrix, distMat[a, b] = distance between points a and b
    :param numIndices: total number of indices to return (including both seeds)
    :param index0: first seed index
    :param index1: second seed index
    :return: int32 array of length ``numIndices`` with the selected indices,
        in selection order (seeds first)
    :raises ValueError: if ``numIndices`` is below 2 or exceeds the number of points
    """
    cdef int i, j, k, jidx, kidx, minj
    cdef double dist, minDist, curDist
    cdef int numPoints = distMat.shape[0]
    cdef int numSelectedIndices = 2
    cdef np.ndarray[INT32_t, ndim=1] selectedIndices
    cdef np.ndarray[INT32_t, ndim=1] remainingIndices

    if numIndices < 2 or numIndices > numPoints:
        raise ValueError(f'numIndices must be within [2, {numPoints}], got {numIndices}')

    selectedIndices = np.empty(numIndices, dtype=np.int32)
    selectedIndices[0] = index0
    selectedIndices[1] = index1

    # Candidates are ALL points of the data set (not just the first
    # numIndices ones), minus the two seeds, so no index can be picked twice.
    remainingIndices = np.arange(numPoints, dtype=np.int32)
    remainingIndices = remainingIndices[(remainingIndices != index0) & (remainingIndices != index1)]

    for i in range(numIndices - 2):
        # Distances are non-negative, so -1 guarantees that the first
        # candidate is always accepted (also when all distances are 0).
        minDist = -1.0
        minj = -1
        for jidx in range(remainingIndices.shape[0]):
            j = remainingIndices[jidx]
            # Seed the running minimum with a real distance rather than a
            # magic sentinel that could be smaller than actual distances.
            dist = distMat[j, selectedIndices[0]]
            for kidx in range(1, numSelectedIndices):
                k = selectedIndices[kidx]
                curDist = distMat[j, k]
                if curDist < dist:
                    dist = curDist

            # Keep the candidate that lies furthest from its nearest selected point.
            if dist > minDist:
                minDist = dist
                minj = j

        selectedIndices[i + 2] = minj
        numSelectedIndices += 1
        remainingIndices = remainingIndices[remainingIndices != minj]

    return selectedIndices
\ No newline at end of file
......@@ -14,6 +14,7 @@ import matplotlib.pyplot as plt
from helpers import ParticleBinSorter
import methods as meth
import geometricMethods as gmeth
import chemometricMethods as cmeth
sys.path.append("C://Users//xbrjos//Desktop//Python")
from gepard import dataset
......@@ -25,7 +26,7 @@ def get_name_from_directory(dirPath: str) -> str:
class TotalResults(object):
methods: list = [meth.RandomSampling, meth.SizeBinFractioning, gmeth.CrossBoxSubSampling,
gmeth.SpiralBoxSubsampling]
measuredFreactions: list = [0.05, 0.1, 0.15, 0.2, 0.5, 0.75, 0.9]
measuredFreactions: list = [0.01, 0.05, 0.1, 0.15, 0.2, 0.5, 0.75, 0.9]
def __init__(self):
super(TotalResults, self).__init__()
......@@ -107,6 +108,8 @@ class TotalResults(object):
boxCreator: gmeth.BoxSelectionCreator = gmeth.BoxSelectionCreator(dataset)
methods += boxCreator.get_crossBoxSubsamplers_for_fraction(fraction)
methods += boxCreator.get_spiralBoxSubsamplers_for_fraction(fraction)
if fraction <= 0.1:
methods.append(cmeth.ChemometricSubsampling(particleContainer, fraction))
return methods
......
......@@ -53,7 +53,7 @@ class TestKennardStone(unittest.TestCase):
self.kennardStone: cmeth.KennardStone = cmeth.KennardStone(np.array([]), 0.1)
def test_get_sampled_indices(self):
numDataSets: int = 1000
numDataSets: int = 4000
self.kennardStone.data = np.random.rand(numDataSets, 2)
self.kennardStone.fraction = 0.1
selectedIndices = self.kennardStone.get_sampled_indices()
......@@ -63,10 +63,19 @@ class TestKennardStone(unittest.TestCase):
self.kennardStone.fraction = 0.1
numDataSets = 2
self.kennardStone.data = np.random.rand(numDataSets, 2)
self.assertRaises(ValueError, self.kennardStone.get_sampled_indices)
selectedIndices = self.kennardStone.get_sampled_indices()
self.assertEqual(len(selectedIndices), 1)
self.kennardStone.fraction = 0.5
numDataSets = 2
self.kennardStone.data = np.random.rand(numDataSets, 2)
selectedIndices = self.kennardStone.get_sampled_indices()
self.assertEqual(len(selectedIndices), numDataSets * self.kennardStone.fraction)
self.assertEqual(len(np.unique(selectedIndices)), numDataSets * self.kennardStone.fraction)
numDataSets = 20
self.kennardStone.data = np.random.rand(numDataSets, 2)
self.kennardStone.fraction = 0.1
selectedIndices = self.kennardStone.get_sampled_indices()
self.assertEqual(len(selectedIndices), 2)
self.assertEqual(len(np.unique(selectedIndices)), 2)
......@@ -120,13 +129,13 @@ class TestChemometricSubsampling(unittest.TestCase):
particleContainer.setParticleContours([contours[0] for i in range(numParticles)])
self.chemSubs: cmeth.ChemometricSubsampling = cmeth.ChemometricSubsampling(particleContainer, desiredFraction=0.1)
def test_get_particle_featurematrix(self):
    """Check the feature matrix shape and the spread of its first six rows.

    The matrix must have shape (7, 5), and in each of the rows 0..5 no
    value may exceed the row mean by more than 0.1.
    """
    featureMatrix: np.ndarray = self.chemSubs._get_particle_featurematrix()
    self.assertEqual(featureMatrix.shape, (7, 5))
    for rowIndex in range(6):
        row: np.ndarray = featureMatrix[rowIndex]
        deviation: np.ndarray = row - row.mean()
        self.assertFalse(np.any(deviation > 0.1))
# def test_get_particle_featurematrix(self):
# features: np.ndarray = self.chemSubs._get_particle_featurematrix()
# self.assertEqual(features.shape, (7, 5))
# for i in range(6):
# diff: np.ndarray = features[i, :] - np.mean(features[i, :])
# self.assertFalse(np.any(diff > 0.1))
# def test_pca(self):
# fname = r'C:\Users\xbrjos\Desktop\temp MP\190326_MCII_WWTP_SB_50_1\190326_MCII_WWTP_SB_50_1.pkl'
# fname = r'C:\Users\xbrjos\Desktop\temp MP\190313_Soil_5_A_50_5_1_50_1\190313_Soil_5_A_50_5_1_50_1.pkl'
......@@ -137,7 +146,7 @@ class TestChemometricSubsampling(unittest.TestCase):
# plt.scatter(princComp[:, 0], princComp[:, 1])
# plt.title(dset.name)
# plt.show()
#
# class TestBAH(unittest.TestCase):
# # def setUp(self) -> None:
......
......@@ -62,7 +62,9 @@ class TestTotalResults(unittest.TestCase):
possibleRandomMethods = 2
possibleCrossBoxMethods = 2
possibleSpiralBoxMethods = 3
totalPossible = possibleCrossBoxMethods + possibleRandomMethods + possibleSpiralBoxMethods
possibleChemometricMethods = 1
totalPossible = possibleCrossBoxMethods + possibleRandomMethods + \
possibleSpiralBoxMethods + possibleChemometricMethods
self.assertEqual(len(methods), totalPossible)
self.assertTrue(containsMethod(methods, meth.RandomSampling(dset, desiredFraction)))
self.assertTrue(containsMethod(methods, meth.SizeBinFractioning(dset, desiredFraction)))
......@@ -74,7 +76,9 @@ class TestTotalResults(unittest.TestCase):
possibleRandomMethods = 2
possibleCrossBoxMethods = 1
possibleSpiralBoxMethods = 0
totalPossible = possibleCrossBoxMethods + possibleRandomMethods + possibleSpiralBoxMethods
possibleChemometricMethods = 0
totalPossible = possibleCrossBoxMethods + possibleRandomMethods + \
possibleSpiralBoxMethods + possibleChemometricMethods
self.assertEqual(len(methods), totalPossible)
self.assertTrue(containsMethod(methods, meth.RandomSampling(dset, desiredFraction)))
self.assertTrue(containsMethod(methods, meth.SizeBinFractioning(dset, desiredFraction)))
......@@ -86,7 +90,9 @@ class TestTotalResults(unittest.TestCase):
possibleRandomMethods = 2
possibleCrossBoxMethods = 0
possibleSpiralBoxMethods = 0
totalPossible = possibleCrossBoxMethods + possibleRandomMethods + possibleSpiralBoxMethods
possibleChemometricMethods = 0
totalPossible = possibleCrossBoxMethods + possibleRandomMethods + \
possibleSpiralBoxMethods + possibleChemometricMethods
self.assertEqual(len(methods), totalPossible)
self.assertTrue(containsMethod(methods, meth.RandomSampling(dset, desiredFraction)))
self.assertTrue(containsMethod(methods, meth.SizeBinFractioning(dset, desiredFraction)))
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment