diff --git a/.gitignore b/.gitignore index b323921c12dc2820ff34496cfa708430ce3529df..e2475a727cd464f919b1f36e4233495981c1138f 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,11 @@ __pycache__/ *.png *.res + +cythonModules/build/ + +*.c + +*.pyd + +*.html diff --git a/chemometricMethods.py b/chemometricMethods.py index 1ff19826e80b7422debe1f3c9a2d84a127787e1d..583f63dd8897edd0efeb6ac7191d70f0522afb9d 100644 --- a/chemometricMethods.py +++ b/chemometricMethods.py @@ -10,6 +10,7 @@ sys.path.append("C://Users//xbrjos//Desktop//Python") from gepard.analysis.particleContainer import ParticleContainer from gepard.analysis import particleAndMeasurement as pm from methods import SubsamplingMethod +from cythonModules.kennardStone import find_furthest_indices def get_pca(data: np.ndarray, numComp: int = 2) -> np.ndarray: @@ -27,7 +28,15 @@ class ChemometricSubsampling(SubsamplingMethod): return 'Chemometric Selection' def apply_subsampling_method(self) -> list: - return [] + vectors: np.ndarray = self._get_particle_featurematrix() + kennardStone: KennardStone = KennardStone(vectors, self.fraction) + indices: list = kennardStone.get_sampled_indices() + selectedParticles: list = [] + for particle in self.particleContainer.particles: + if particle.index in selectedParticles: + selectedParticles.append(particle) + + return selectedParticles def _get_particle_featurematrix(self) -> np.ndarray: vectors: list = [] @@ -65,59 +74,22 @@ class KennardStone(object): self.fraction: float = desiredFraction def get_sampled_indices(self) -> list: + """ + Adapted from https://github.com/karoka/Kennard-Stone-Algorithm/blob/master/kenStone.py + :return""" t0 = time.time() - selectedIndices: set = set([]) numIndices: int = round(self.data.shape[0] * self.fraction) if numIndices < 2: raise ValueError(f'Not enough indices to generate (min = 2), requested: {numIndices}') else: distMat = spatial.distance_matrix(self.data, self.data) + i, j = np.unravel_index(distMat.argmax(), distMat.shape) - remainingIndices: set = set(np.arange(self.data.shape[0])) - selectedIndices.add(i) - selectedIndices.add(j) - remainingIndices.remove(i) - remainingIndices.remove(j) - for _ in range(numIndices-2): - minDist = 0.0 - for j in remainingIndices: - dist = np.min([distMat[j][i] for i in selectedIndices]) - if dist > minDist: - minj = j - minDist = dist - selectedIndices.add(minj) - remainingIndices.remove(minj) + selectedIndices = find_furthest_indices(distMat, int(numIndices), i, j) assert len(np.unique(list(selectedIndices))) == len(selectedIndices) print('selecting indices time:', np.round(time.time() - t0, 2), 'seconds') return list(selectedIndices) - # def get_sampled_indices(self) -> list: - # t0 = time.time() - # numIndices: int = round(self.data.shape[0] * self.fraction) - # if numIndices < 2: - # raise ValueError(f'Not enough indices to generate (min = 2), requested: {numIndices}') - # else: - # startInd = self._get_start_indices() - # selectedPoints = np.zeros((numIndices, 2)) - # selectedPoints[0, :] = self.data[startInd[0], :] - # selectedPoints[1, :] = self.data[startInd[1], :] - # - # if numIndices > 2: - # data: np.ndarray = np.delete(self.data, startInd, 0) - # for i in range(numIndices-2): - # newIndex: int = self._get_point_furthest_from_other_points(selectedPoints[:i+2, :], data) - # selectedPoints[i+2, :] = data[newIndex, :] - # data = np.delete(data, newIndex, 0) - # - # selectedIndices = [] - # assert numIndices == selectedPoints.shape[0] - # for i in range(numIndices): - # newInd = np.where(self.data == selectedPoints[i])[0][0] - # selectedIndices.append(newInd) - # - # assert len(np.unique(selectedIndices)) == len(selectedIndices) - # print('selecting indices time:', np.round(time.time()-t0, 2), 'seconds') - # return selectedIndices def _get_start_indices(self) -> list: """ diff --git a/cythonModules/kennardStone.pyx b/cythonModules/kennardStone.pyx new file mode 100644 index 0000000000000000000000000000000000000000..baa786bc32396d22d066733d3942a4800f22b4d1 --- /dev/null +++ b/cythonModules/kennardStone.pyx @@ -0,0 +1,36 @@ +import numpy as np +cimport numpy as np +cimport cython + +DTYPE = np.float +ctypedef np.float_t DTYPE_t +ctypedef np.int32_t INT32_t +ctypedef np.int64_t INT64_t + + +def find_furthest_indices(np.ndarray[DTYPE_t, ndim=2] distMat, int numIndices, int index0, int index1): + cdef int i, j + cdef double dist, minDist, curDist + cdef np.ndarray[INT32_t, ndim=1] selectedIndices = np.empty(numIndices, dtype=np.int32) + cdef np.ndarray[INT32_t, ndim=1] remainingIndices = np.arange(numIndices, dtype=np.int32) + + selectedIndices[0] = index0 + selectedIndices[1] = index1 + for i in range(numIndices-2): + minDist = 0.0 + for j in remainingIndices: + dist = np.inf + + for k in selectedIndices[:i+1]: + curDist = distMat[j][k] + if curDist < dist: + dist = curDist + + if dist > minDist: + minj = j + minDist = dist + + selectedIndices[i+2] = minj + remainingIndices = remainingIndices[remainingIndices!=minj] + + return selectedIndices \ No newline at end of file diff --git a/cythonModules/setuptKennardStone.py b/cythonModules/setuptKennardStone.py new file mode 100644 index 0000000000000000000000000000000000000000..093a3235e3c71c2e39dcc0990d4eb070d8f9d6cc --- /dev/null +++ b/cythonModules/setuptKennardStone.py @@ -0,0 +1,22 @@ +try: + from setuptools import setup + from setuptools import Extension +except ImportError: + from distutils.core import setup + from distutils.extension import Extension +from Cython.Build import cythonize +import numpy as np +import sys + +if len(sys.argv)==1: + sys.argv.append("build_ext") + sys.argv.append("--inplace") + +ext = Extension("kennardStone", ["kennardStone.pyx"], + extra_compile_args=['-O3'],) + +setup( + name = "kennardStone algorithm", + ext_modules = cythonize([ext], annotate=True), # accepts a glob pattern + include_dirs=[np.get_include()] +) \ No newline at end of file diff --git a/tests/test_chemometricMethods.py b/tests/test_chemometricMethods.py index a08129b42f4379d264605ab6fd3cf1cd64d5cd1a..e2df2f9698ee078e6fb032aaca2eddcae58d49a6 100644 --- a/tests/test_chemometricMethods.py +++ b/tests/test_chemometricMethods.py @@ -53,17 +53,13 @@ class TestKennardStone(unittest.TestCase): self.kennardStone: cmeth.KennardStone = cmeth.KennardStone(np.array([]), 0.1) def test_get_sampled_indices(self): - numDataSets: int = 400 + numDataSets: int = 1000 self.kennardStone.data = np.random.rand(numDataSets, 2) self.kennardStone.fraction = 0.1 selectedIndices = self.kennardStone.get_sampled_indices() self.assertEqual(len(selectedIndices), numDataSets*self.kennardStone.fraction) self.assertEqual(len(np.unique(selectedIndices)), numDataSets*self.kennardStone.fraction) - plt.scatter(self.kennardStone.data[:, 0], self.kennardStone.data[:, 1]) - plt.scatter(self.kennardStone.data[selectedIndices, 0], self.kennardStone.data[selectedIndices, 1]) - plt.show() - self.kennardStone.fraction = 0.1 numDataSets = 2 self.kennardStone.data = np.random.rand(numDataSets, 2)