Commit 66624cab authored by Josef Brandt

Removed KennardStones, Added DBSCAN Clustering

parent 93f9a3fe
@@ -2,24 +2,57 @@ import numpy as np
import cv2
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
+from sklearn.cluster import DBSCAN
from scipy import spatial
from itertools import combinations
from random import sample
import time
import sys
sys.path.append("C://Users//xbrjos//Desktop//Python")
from gepard.analysis.particleContainer import ParticleContainer
from gepard.analysis import particleAndMeasurement as pm
from methods import SubsamplingMethod
-from cythonModules.kennardStone import find_furthest_indices
def get_pca(data: np.ndarray, numComp: int = 2) -> np.ndarray:
-    standardizedData = StandardScaler().fit_transform(data)
+    try:
+        standardizedData = StandardScaler().fit_transform(data)
+    except ValueError:
+        print('first standardscaler attempt failed, retrying..')
+        standardizedData = StandardScaler().fit_transform(data)
    pca = PCA(n_components=numComp)
    princComp: np.ndarray = pca.fit_transform(np.transpose(standardizedData))
    return princComp

def do_DBSCAN_clustering(data: np.ndarray, eps: float = 0.1, min_samples: int = 10) -> tuple:
    """
    Runs DBSCAN clustering and identifies noisy data.
    :param data: The input array (rows: individual points, cols: x, y)
    :param eps: maximum distance between two samples for them to be considered neighbours
    :param min_samples: minimum number of neighbours for a point to count as a core point
    :return: Cluster labels for each point in the dataset given to fit() (noisy samples get the label -1)
        and the indices of the core samples.
    """
    assert data.shape[1] == 2
    standardizedData = StandardScaler().fit_transform(data)
    db = DBSCAN(eps=eps, min_samples=min_samples).fit(standardizedData)
    return db.labels_, db.core_sample_indices_

def get_n_points_closest_to_point(points: np.ndarray, n: int, refPoint: np.ndarray) -> list:
    """
    Returns a list with the indices of the n points that are closest to the indicated refPoint
    :param points: np.ndarray, cols: x, y, rows: individual points
    :param n: number of points to return
    :param refPoint: np.array([x, y]) of the reference point
    :return: list of point indices
    """
    distancesToPoints: np.ndarray = np.linalg.norm(points - refPoint, axis=1)
    sortedIndices = np.argsort(distancesToPoints)
    return list(sortedIndices[:n])
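
For orientation, a minimal usage sketch of the two helpers above on synthetic data, with illustrative parameter values (it assumes both functions are in scope; none of this is part of the commit):

```python
import numpy as np

# two dense blobs plus sparse background noise (synthetic, for illustration only)
rng = np.random.RandomState(42)
blobA = rng.normal(0.0, 0.2, size=(100, 2))
blobB = rng.normal(2.0, 0.2, size=(100, 2))
background = rng.uniform(-1.0, 3.0, size=(20, 2))
points = np.vstack((blobA, blobB, background))

labels, coreIndices = do_DBSCAN_clustering(points, eps=0.3, min_samples=10)
clusterIds = sorted(set(labels) - {-1})
print('clusters found:', clusterIds, '| noise points:', int(np.sum(labels == -1)))

# the n points closest to a cluster center are its most "typical" members
if clusterIds:
    clusterPoints = points[labels == clusterIds[0]]
    center = np.mean(clusterPoints, axis=0)
    print('most central points:', get_n_points_closest_to_point(clusterPoints, 5, center))
```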

class ChemometricSubsampling(SubsamplingMethod):
    def __init__(self, particleContainer: ParticleContainer, desiredFraction: float):
        super(ChemometricSubsampling, self).__init__(particleContainer, desiredFraction)
@@ -31,8 +64,9 @@ class ChemometricSubsampling(SubsamplingMethod):
    def apply_subsampling_method(self) -> list:
        vectors: np.ndarray = self._get_particle_featurematrix()
        princComps: np.ndarray = get_pca(vectors)
-        kennardStone: KennardStone = KennardStone(princComps, self.fraction)
-        indices: list = kennardStone.get_sampled_indices()
+        clusterLabels, coreIndices = do_DBSCAN_clustering(princComps)
+        indices: list = self._get_indices_from_clusterLabels(princComps, clusterLabels, coreIndices)
        selectedParticles: list = []
        for particle in self.particleContainer.particles:
            if particle.index in indices:
@@ -54,6 +88,92 @@ class ChemometricSubsampling(SubsamplingMethod):
            equals = True
        return equals

    def _get_indices_from_clusterLabels(self, points: np.ndarray, labels: np.ndarray, centerIndices: np.ndarray) -> list:
        indices: list = []
        allIndices: np.ndarray = np.arange(len(labels))
        numPointsPerCluster: dict = self._get_numPoints_per_cluster(labels)
        for clusterIndex in set(labels):
            indToAppend: list = []
            nPoints: int = int(numPointsPerCluster[clusterIndex])
            indicesInCluster: np.ndarray = allIndices[labels == clusterIndex]
            if clusterIndex == -1:
                indToAppend = sample(list(indicesInCluster), nPoints)
            else:
                clusterPoints: np.ndarray = points[indicesInCluster]
                centerPoint: np.ndarray = np.mean(clusterPoints, axis=0)
                # get_n_points_closest_to_point returns indices *within* clusterPoints;
                # map them back to global point indices, otherwise the wrong particles are selected
                localIndices: list = get_n_points_closest_to_point(clusterPoints, nPoints, centerPoint)
                indToAppend = [int(indicesInCluster[i]) for i in localIndices]
            for ind in indToAppend:
                indices.append(ind)
        return indices

    def _get_numPoints_per_cluster(self, labels: np.ndarray, noiseAmpFactor: float = 5) -> dict:
        """
        MP particles are expected to be a minority among all particles. So, if data points were classified as noise
        (i.e., label = -1), it is likely that MP is found among them. The number of points taken from the noise is
        therefore multiplied by the noiseAmpFactor.
        :param labels: cluster label of each data point, noise being labelled -1
        :param noiseAmpFactor: oversampling factor for the noise points
        :return: A dictionary with keys = cluster index (i.e., label) and value = number of points to take from that
            cluster
        """
        pointsPerCluster: dict = {}
        if type(labels) != np.ndarray:
            labels = np.array(labels)

        individualLabels: set = set(labels)
        numPointsToSelect = round(len(labels) * self.fraction)
        if numPointsToSelect == 0:
            numPointsToSelect = 1

        numNoisePoints = len(labels[labels == -1])
        numClusteredPoints = len(labels) - numNoisePoints

        # cap noiseAmpFactor, otherwise more noise points than available could be requested
        if noiseAmpFactor > 1/self.fraction:
            noiseAmpFactor = 1/self.fraction

        numAmpPoints = numClusteredPoints + numNoisePoints*noiseAmpFactor
        fractionPerCluster = np.clip(numPointsToSelect / numAmpPoints, 0.0, 1.0)
        print('fractionPerCluster, self.fraction', fractionPerCluster, self.fraction)

        tooFewPoints = numPointsToSelect < len(individualLabels)

        totalPointsAdded = 0
        for ind in individualLabels:
            if ind > -1:
                if not tooFewPoints:
                    pointsToAdd = round(fractionPerCluster * len(labels[labels == ind]))
                else:
                    pointsToAdd = 1 if totalPointsAdded < numPointsToSelect else 0
                pointsPerCluster[ind] = pointsToAdd
                totalPointsAdded += pointsToAdd

        # fill up the rest with noise points
        if numNoisePoints > 0:
            diff: float = np.clip(numPointsToSelect - totalPointsAdded, 0, numNoisePoints)
            pointsPerCluster[-1] = diff
            totalPointsAdded += diff
        # just in case too many points were selected (due to rounding errors), keep on deleting until it matches
        while totalPointsAdded > numPointsToSelect:
            indexWithHighestCount = None
            maxCount = 0
            # iterate over the cluster labels (keys), not the counts, to find the fullest cluster
            for index in pointsPerCluster.keys():
                if pointsPerCluster[index] > maxCount:
                    maxCount = pointsPerCluster[index]
                    indexWithHighestCount = index
            pointsPerCluster[indexWithHighestCount] -= 1
            totalPointsAdded -= 1

        assert abs(totalPointsAdded - numPointsToSelect) <= 1
        for clusterIndex in pointsPerCluster.keys():
            assert 0 <= pointsPerCluster[clusterIndex] <= len(labels[labels == clusterIndex])
        return pointsPerCluster
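
A quick worked example of the selection budget above, with hypothetical numbers (1000 particles, fraction 0.1, 100 noise points); the sketch only mirrors the arithmetic of `_get_numPoints_per_cluster` outside the class:

```python
fraction = 0.1
numPoints, numNoisePoints = 1000, 100
numClusteredPoints = numPoints - numNoisePoints                       # 900

noiseAmpFactor = min(5, 1 / fraction)                                 # stays 5 (cap would be 10)
numPointsToSelect = round(numPoints * fraction)                       # 100
numAmpPoints = numClusteredPoints + numNoisePoints * noiseAmpFactor   # 1400
fractionPerCluster = numPointsToSelect / numAmpPoints                 # ~0.071

fromClusters = round(fractionPerCluster * numClusteredPoints)         # ~64
fromNoise = min(numPointsToSelect - fromClusters, numNoisePoints)     # ~36
# noise is sampled at ~36%, i.e. roughly noiseAmpFactor times the ~7.1% cluster rate
print(fractionPerCluster, fromClusters, fromNoise)
```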

class FeatureExtractor(object):
    def __init__(self, particle: pm.Particle):
@@ -61,7 +181,9 @@ class FeatureExtractor(object):
        self.particle: pm.Particle = particle

    def get_characteristic_vector(self) -> np.ndarray:
-        return self._get_log_hu_moments()
+        log_hu: np.ndarray = self._get_log_hu_moments()
+        color: np.ndarray = self._get_color_hash(self.particle.color)
+        return np.transpose(np.hstack((log_hu, color)))

    def _get_log_hu_moments(self) -> np.ndarray:
        moments: dict = cv2.moments(self.particle.contour)
@@ -74,43 +196,6 @@ class FeatureExtractor(object):
        return resultMoments[:, 0]

class KennardStone(object):
    def __init__(self, data: np.ndarray, desiredFraction: float):
        super(KennardStone, self).__init__()
        self.data: np.ndarray = data
        self.fraction: float = desiredFraction

    def get_sampled_indices(self) -> list:
        """
        Adapted from https://github.com/karoka/Kennard-Stone-Algorithm/blob/master/kenStone.py
        :return: list of the selected indices
        """
        t0 = time.time()
        assert self.data.shape[1] == 2
        numIndices: int = int(np.ceil(self.data.shape[0] * self.fraction))
        if numIndices == 1:
            selectedIndices = [int(round(self.data.shape[0]/2))]
        else:
            distMat = spatial.distance_matrix(self.data, self.data)
            i, j = np.unravel_index(distMat.argmax(), distMat.shape)
            selectedIndices = find_furthest_indices(distMat, int(numIndices), i, j)
            assert len(np.unique(list(selectedIndices))) == len(selectedIndices)
        print(f'selecting indices time: {np.round(time.time() - t0, 2)} seconds ({numIndices} out of {self.data.shape[0]})')
        return list(selectedIndices)

    def _get_start_indices(self) -> list:
        """
        Finds the indices of the two points that are furthest apart.
        Adapted from https://stackoverflow.com/questions/50468643/finding-two-most-far-away-points-in-plot-with-many-points-in-python/50469147
        :return: sorted list of the two start indices
        """
        assert self.data.shape[1] == 2
        candidates = self.data[spatial.ConvexHull(self.data).vertices]
        dist_mat = spatial.distance_matrix(candidates, candidates)
        i, j = np.unravel_index(dist_mat.argmax(), dist_mat.shape)
        index1 = np.where(self.data == candidates[i])[0][0]
        index2 = np.where(self.data == candidates[j])[0][0]
        assert index1 != index2
        return sorted([index1, index2])

    def _get_color_hash(self, color: str, desiredLength: int = 4) -> np.ndarray:
        # zero-pad, so that the vector always has the same length, even when the digit string is short
        colorArray: list = [int(i) for i in str(abs(hash(color)) % (10**desiredLength)).zfill(desiredLength)]
        return np.transpose(np.array(colorArray))
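
One caveat with `_get_color_hash`: Python's built-in `hash()` is salted per interpreter run (PYTHONHASHSEED), so the same color string yields different digit vectors in different sessions. If reproducible feature vectors matter, a stable digest could be used instead; a sketch, with a helper name of my choosing:

```python
import hashlib

import numpy as np


def get_stable_color_hash(color: str, desiredLength: int = 4) -> np.ndarray:
    # md5 is deterministic across runs, unlike the per-process salted built-in hash()
    digest: int = int(hashlib.md5(color.encode('utf-8')).hexdigest(), 16)
    # zfill keeps the vector length constant even when the leading digits are zero
    return np.array([int(c) for c in str(digest % 10**desiredLength).zfill(desiredLength)])
```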
import numpy as np
cimport numpy as np
cimport cython
DTYPE = np.float
ctypedef np.float_t DTYPE_t
ctypedef np.int32_t INT32_t
ctypedef np.int64_t INT64_t

def find_furthest_indices(np.ndarray[DTYPE_t, ndim=2] distMat, int numIndices, int index0, int index1):
    cdef int i, j, k, jidx, kidx, minj
    cdef double dist, minDist, curDist
    cdef np.ndarray[INT32_t, ndim=1] selectedIndices = np.empty(numIndices, dtype=np.int32)
    # candidates are all points of the dataset except the two start indices
    cdef np.ndarray[INT32_t, ndim=1] remainingIndices = np.arange(distMat.shape[0], dtype=np.int32)
    remainingIndices = remainingIndices[(remainingIndices != index0) & (remainingIndices != index1)]
    cdef int numSelectedIndices = 2
    selectedIndices[0] = index0
    selectedIndices[1] = index1
    for i in range(numIndices-2):
        minDist = 0.0
        for jidx in range(remainingIndices.shape[0]):
            j = remainingIndices[jidx]
            # distance of candidate j to its nearest already-selected point
            dist = 1E6
            for kidx in range(numSelectedIndices):
                k = selectedIndices[kidx]
                curDist = distMat[j, k]
                if curDist < dist:
                    dist = curDist
            # keep the candidate that is furthest from the selected set (max-min criterion)
            if dist > minDist:
                minj = j
                minDist = dist
        selectedIndices[i+2] = minj
        numSelectedIndices += 1
        remainingIndices = remainingIndices[remainingIndices != minj]
    return selectedIndices
\ No newline at end of file
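
For reference, the max-min (Kennard-Stone) selection that the Cython routine above implements, reconstructed as a plain NumPy sketch (for readability only; not code from the repository):

```python
import numpy as np
from scipy import spatial


def kennard_stone_indices(points: np.ndarray, numIndices: int) -> list:
    # greedy max-min selection: seed with the two most distant points, then
    # repeatedly add the point whose distance to the selected set is largest
    distMat = spatial.distance_matrix(points, points)
    i, j = np.unravel_index(distMat.argmax(), distMat.shape)
    selected = [int(i), int(j)]
    remaining = [k for k in range(points.shape[0]) if k not in selected]
    while len(selected) < numIndices and remaining:
        distToSelected = distMat[np.ix_(remaining, selected)].min(axis=1)
        nextIndex = remaining[int(np.argmax(distToSelected))]
        selected.append(nextIndex)
        remaining.remove(nextIndex)
    return selected
```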
try:
    from setuptools import setup
    from setuptools import Extension
except ImportError:
    from distutils.core import setup
    from distutils.extension import Extension
from Cython.Build import cythonize
import numpy as np
import sys

if len(sys.argv) == 1:
    sys.argv.append("build_ext")
    sys.argv.append("--inplace")

ext = Extension("kennardStone", ["kennardStone.pyx"],
                extra_compile_args=['-O3'],)

setup(
    name="kennardStone algorithm",
    ext_modules=cythonize([ext], annotate=True),  # accepts a glob pattern
    include_dirs=[np.get_include()]
)
\ No newline at end of file
@@ -24,9 +24,11 @@ def get_name_from_directory(dirPath: str) -> str:

class TotalResults(object):
    methods: list = [meth.RandomSampling, meth.SizeBinFractioning, gmeth.CrossBoxSubSampling,
                     gmeth.SpiralBoxSubsampling]
-    measuredFreactions: list = [0.01, 0.05, 0.1, 0.15, 0.2, 0.5, 0.75, 0.9]
    # methods: list = [meth.RandomSampling, meth.SizeBinFractioning, gmeth.CrossBoxSubSampling,
    #                  gmeth.SpiralBoxSubsampling, cmeth.ChemometricSubsampling]
    # measuredFractions: list = [0.01, 0.05, 0.1, 0.15, 0.2, 0.5, 0.75, 0.9]
    # measuredFractions: list = [0.1, 0.15, 0.2, 0.5, 0.75, 0.9]
+    measuredFractions: list = [0.1, 0.3, 0.5, 0.7, 0.9]

    def __init__(self):
        super(TotalResults, self).__init__()
@@ -56,10 +58,10 @@ class TotalResults(object):
"""
for index, sample in enumerate(self.sampleResults):
sample.load_dataset()
for fraction in self.measuredFreactions:
for fraction in self.measuredFractions:
possibleMethods = self._get_methods_for_fraction(sample.dataset, fraction)
for curMethod in possibleMethods:
# print(f'updating {sample.sampleName} with {curMethod.label} at fraction {fraction}')
print(f'updating {sample.sampleName} with {curMethod.label} at fraction {fraction}')
sample.update_result_with_method(curMethod, force=force)
print(f'processed {index+1} of {len(self.sampleResults)} samples')
@@ -108,8 +110,8 @@ class TotalResults(object):
        boxCreator: gmeth.BoxSelectionCreator = gmeth.BoxSelectionCreator(dataset)
        methods += boxCreator.get_crossBoxSubsamplers_for_fraction(fraction)
        methods += boxCreator.get_spiralBoxSubsamplers_for_fraction(fraction)
-        if fraction <= 0.1:
-            methods.append(cmeth.ChemometricSubsampling(particleContainer, fraction))
+        methods.append(cmeth.ChemometricSubsampling(particleContainer, fraction))
+        # methods = [cmeth.ChemometricSubsampling(particleContainer, fraction)]
        return methods
@@ -30,8 +30,8 @@ save_results('results1.res', results)
plt.clf()
errorPerFraction: dict = results.get_error_vs_fraction_data(attributes=['air', 'water'],
-                                                            methods=['chemo', 'random'])
-                                                            # methods=['spiral', 'cross'])
+                                                            methods=['chemo', 'sizeBin', 'random'])
plt.subplot(121)
for methodLabel in errorPerFraction.keys():
    fractions: list = list(errorPerFraction[methodLabel].keys())
@@ -47,7 +47,7 @@ plt.ylabel('mpCountError (%)', fontsize=12)
plt.legend()

errorPerFraction: dict = results.get_error_vs_fraction_data(attributes=['sediment', 'soil', 'beach', 'slush'],
-                                                            methods=['chemo', 'random'])
+                                                            methods=['chemo', 'sizeBin', 'random'])
plt.subplot(122)
for methodLabel in errorPerFraction.keys():
    fractions: list = list(errorPerFraction[methodLabel].keys())
@@ -76,7 +76,7 @@ class TestTotalResults(unittest.TestCase):
        possibleRandomMethods = 2
        possibleCrossBoxMethods = 1
        possibleSpiralBoxMethods = 0
-        possibleChemometricMethods = 0
+        possibleChemometricMethods = 1
        totalPossible = possibleCrossBoxMethods + possibleRandomMethods + \
                        possibleSpiralBoxMethods + possibleChemometricMethods
        self.assertEqual(len(methods), totalPossible)
@@ -90,7 +90,7 @@ class TestTotalResults(unittest.TestCase):
        possibleRandomMethods = 2
        possibleCrossBoxMethods = 0
        possibleSpiralBoxMethods = 0
-        possibleChemometricMethods = 0
+        possibleChemometricMethods = 1
        totalPossible = possibleCrossBoxMethods + possibleRandomMethods + \
                        possibleSpiralBoxMethods + possibleChemometricMethods
        self.assertEqual(len(methods), totalPossible)