Commit 5d22e3c7 authored by Josef Brandt's avatar Josef Brandt

Converted KennardStone to Cython

parent 5dec089f
...@@ -6,3 +6,11 @@ __pycache__/ ...@@ -6,3 +6,11 @@ __pycache__/
*.png *.png
*.res *.res
cythonModules/build/
*.c
*.pyd
*.html
...@@ -10,6 +10,7 @@ sys.path.append("C://Users//xbrjos//Desktop//Python") ...@@ -10,6 +10,7 @@ sys.path.append("C://Users//xbrjos//Desktop//Python")
from gepard.analysis.particleContainer import ParticleContainer from gepard.analysis.particleContainer import ParticleContainer
from gepard.analysis import particleAndMeasurement as pm from gepard.analysis import particleAndMeasurement as pm
from methods import SubsamplingMethod from methods import SubsamplingMethod
from cythonModules.kennardStone import find_furthest_indices
def get_pca(data: np.ndarray, numComp: int = 2) -> np.ndarray: def get_pca(data: np.ndarray, numComp: int = 2) -> np.ndarray:
...@@ -27,7 +28,15 @@ class ChemometricSubsampling(SubsamplingMethod): ...@@ -27,7 +28,15 @@ class ChemometricSubsampling(SubsamplingMethod):
return 'Chemometric Selection' return 'Chemometric Selection'
def apply_subsampling_method(self) -> list: def apply_subsampling_method(self) -> list:
return [] vectors: np.ndarray = self._get_particle_featurematrix()
kennardStone: KennardStone = KennardStone(vectors, self.fraction)
indices: list = kennardStone.get_sampled_indices()
selectedParticles: list = []
for particle in self.particleContainer.particles:
if particle.index in selectedParticles:
selectedParticles.append(particle)
return selectedParticles
def _get_particle_featurematrix(self) -> np.ndarray: def _get_particle_featurematrix(self) -> np.ndarray:
vectors: list = [] vectors: list = []
...@@ -65,59 +74,22 @@ class KennardStone(object): ...@@ -65,59 +74,22 @@ class KennardStone(object):
self.fraction: float = desiredFraction self.fraction: float = desiredFraction
def get_sampled_indices(self) -> list: def get_sampled_indices(self) -> list:
"""
Adapted from https://github.com/karoka/Kennard-Stone-Algorithm/blob/master/kenStone.py
:return"""
t0 = time.time() t0 = time.time()
selectedIndices: set = set([])
numIndices: int = round(self.data.shape[0] * self.fraction) numIndices: int = round(self.data.shape[0] * self.fraction)
if numIndices < 2: if numIndices < 2:
raise ValueError(f'Not enough indices to generate (min = 2), requested: {numIndices}') raise ValueError(f'Not enough indices to generate (min = 2), requested: {numIndices}')
else: else:
distMat = spatial.distance_matrix(self.data, self.data) distMat = spatial.distance_matrix(self.data, self.data)
i, j = np.unravel_index(distMat.argmax(), distMat.shape) i, j = np.unravel_index(distMat.argmax(), distMat.shape)
remainingIndices: set = set(np.arange(self.data.shape[0])) selectedIndices = find_furthest_indices(distMat, int(numIndices), i, j)
selectedIndices.add(i)
selectedIndices.add(j)
remainingIndices.remove(i)
remainingIndices.remove(j)
for _ in range(numIndices-2):
minDist = 0.0
for j in remainingIndices:
dist = np.min([distMat[j][i] for i in selectedIndices])
if dist > minDist:
minj = j
minDist = dist
selectedIndices.add(minj)
remainingIndices.remove(minj)
assert len(np.unique(list(selectedIndices))) == len(selectedIndices) assert len(np.unique(list(selectedIndices))) == len(selectedIndices)
print('selecting indices time:', np.round(time.time() - t0, 2), 'seconds') print('selecting indices time:', np.round(time.time() - t0, 2), 'seconds')
return list(selectedIndices) return list(selectedIndices)
# def get_sampled_indices(self) -> list:
# t0 = time.time()
# numIndices: int = round(self.data.shape[0] * self.fraction)
# if numIndices < 2:
# raise ValueError(f'Not enough indices to generate (min = 2), requested: {numIndices}')
# else:
# startInd = self._get_start_indices()
# selectedPoints = np.zeros((numIndices, 2))
# selectedPoints[0, :] = self.data[startInd[0], :]
# selectedPoints[1, :] = self.data[startInd[1], :]
#
# if numIndices > 2:
# data: np.ndarray = np.delete(self.data, startInd, 0)
# for i in range(numIndices-2):
# newIndex: int = self._get_point_furthest_from_other_points(selectedPoints[:i+2, :], data)
# selectedPoints[i+2, :] = data[newIndex, :]
# data = np.delete(data, newIndex, 0)
#
# selectedIndices = []
# assert numIndices == selectedPoints.shape[0]
# for i in range(numIndices):
# newInd = np.where(self.data == selectedPoints[i])[0][0]
# selectedIndices.append(newInd)
#
# assert len(np.unique(selectedIndices)) == len(selectedIndices)
# print('selecting indices time:', np.round(time.time()-t0, 2), 'seconds')
# return selectedIndices
def _get_start_indices(self) -> list: def _get_start_indices(self) -> list:
""" """
......
import numpy as np
cimport numpy as np
cimport cython
DTYPE = np.float
ctypedef np.float_t DTYPE_t
ctypedef np.int32_t INT32_t
ctypedef np.int64_t INT64_t
def find_furthest_indices(np.ndarray[DTYPE_t, ndim=2] distMat, int numIndices, int index0, int index1):
cdef int i, j
cdef double dist, minDist, curDist
cdef np.ndarray[INT32_t, ndim=1] selectedIndices = np.empty(numIndices, dtype=np.int32)
cdef np.ndarray[INT32_t, ndim=1] remainingIndices = np.arange(numIndices, dtype=np.int32)
selectedIndices[0] = index0
selectedIndices[1] = index1
for i in range(numIndices-2):
minDist = 0.0
for j in remainingIndices:
dist = np.inf
for k in selectedIndices[:i+1]:
curDist = distMat[j][k]
if curDist < dist:
dist = curDist
if dist > minDist:
minj = j
minDist = dist
selectedIndices[i+2] = minj
remainingIndices = remainingIndices[remainingIndices!=minj]
return selectedIndices
\ No newline at end of file
try:
from setuptools import setup
from setuptools import Extension
except ImportError:
from distutils.core import setup
from distutils.extension import Extension
from Cython.Build import cythonize
import numpy as np
import sys
if len(sys.argv)==1:
sys.argv.append("build_ext")
sys.argv.append("--inplace")
ext = Extension("kennardStone", ["kennardStone.pyx"],
extra_compile_args=['-O3'],)
setup(
name = "kennardStone algorithm",
ext_modules = cythonize([ext], annotate=True), # accepts a glob pattern
include_dirs=[np.get_include()]
)
\ No newline at end of file
...@@ -53,17 +53,13 @@ class TestKennardStone(unittest.TestCase): ...@@ -53,17 +53,13 @@ class TestKennardStone(unittest.TestCase):
self.kennardStone: cmeth.KennardStone = cmeth.KennardStone(np.array([]), 0.1) self.kennardStone: cmeth.KennardStone = cmeth.KennardStone(np.array([]), 0.1)
def test_get_sampled_indices(self): def test_get_sampled_indices(self):
numDataSets: int = 400 numDataSets: int = 1000
self.kennardStone.data = np.random.rand(numDataSets, 2) self.kennardStone.data = np.random.rand(numDataSets, 2)
self.kennardStone.fraction = 0.1 self.kennardStone.fraction = 0.1
selectedIndices = self.kennardStone.get_sampled_indices() selectedIndices = self.kennardStone.get_sampled_indices()
self.assertEqual(len(selectedIndices), numDataSets*self.kennardStone.fraction) self.assertEqual(len(selectedIndices), numDataSets*self.kennardStone.fraction)
self.assertEqual(len(np.unique(selectedIndices)), numDataSets*self.kennardStone.fraction) self.assertEqual(len(np.unique(selectedIndices)), numDataSets*self.kennardStone.fraction)
plt.scatter(self.kennardStone.data[:, 0], self.kennardStone.data[:, 1])
plt.scatter(self.kennardStone.data[selectedIndices, 0], self.kennardStone.data[selectedIndices, 1])
plt.show()
self.kennardStone.fraction = 0.1 self.kennardStone.fraction = 0.1
numDataSets = 2 numDataSets = 2
self.kennardStone.data = np.random.rand(numDataSets, 2) self.kennardStone.data = np.random.rand(numDataSets, 2)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment