Commit 5d22e3c7 authored by Josef Brandt's avatar Josef Brandt

Converted KennardStone to Cython

parent 5dec089f
......@@ -6,3 +6,11 @@ __pycache__/
*.png
*.res
cythonModules/build/
*.c
*.pyd
*.html
......@@ -10,6 +10,7 @@ sys.path.append("C://Users//xbrjos//Desktop//Python")
from gepard.analysis.particleContainer import ParticleContainer
from gepard.analysis import particleAndMeasurement as pm
from methods import SubsamplingMethod
from cythonModules.kennardStone import find_furthest_indices
def get_pca(data: np.ndarray, numComp: int = 2) -> np.ndarray:
......@@ -27,7 +28,15 @@ class ChemometricSubsampling(SubsamplingMethod):
return 'Chemometric Selection'
def apply_subsampling_method(self) -> list:
return []
vectors: np.ndarray = self._get_particle_featurematrix()
kennardStone: KennardStone = KennardStone(vectors, self.fraction)
indices: list = kennardStone.get_sampled_indices()
selectedParticles: list = []
for particle in self.particleContainer.particles:
if particle.index in selectedParticles:
selectedParticles.append(particle)
return selectedParticles
def _get_particle_featurematrix(self) -> np.ndarray:
vectors: list = []
......@@ -65,59 +74,22 @@ class KennardStone(object):
self.fraction: float = desiredFraction
def get_sampled_indices(self) -> list:
"""
Adapted from https://github.com/karoka/Kennard-Stone-Algorithm/blob/master/kenStone.py
:return"""
t0 = time.time()
selectedIndices: set = set([])
numIndices: int = round(self.data.shape[0] * self.fraction)
if numIndices < 2:
raise ValueError(f'Not enough indices to generate (min = 2), requested: {numIndices}')
else:
distMat = spatial.distance_matrix(self.data, self.data)
i, j = np.unravel_index(distMat.argmax(), distMat.shape)
remainingIndices: set = set(np.arange(self.data.shape[0]))
selectedIndices.add(i)
selectedIndices.add(j)
remainingIndices.remove(i)
remainingIndices.remove(j)
for _ in range(numIndices-2):
minDist = 0.0
for j in remainingIndices:
dist = np.min([distMat[j][i] for i in selectedIndices])
if dist > minDist:
minj = j
minDist = dist
selectedIndices.add(minj)
remainingIndices.remove(minj)
selectedIndices = find_furthest_indices(distMat, int(numIndices), i, j)
assert len(np.unique(list(selectedIndices))) == len(selectedIndices)
print('selecting indices time:', np.round(time.time() - t0, 2), 'seconds')
return list(selectedIndices)
# def get_sampled_indices(self) -> list:
# t0 = time.time()
# numIndices: int = round(self.data.shape[0] * self.fraction)
# if numIndices < 2:
# raise ValueError(f'Not enough indices to generate (min = 2), requested: {numIndices}')
# else:
# startInd = self._get_start_indices()
# selectedPoints = np.zeros((numIndices, 2))
# selectedPoints[0, :] = self.data[startInd[0], :]
# selectedPoints[1, :] = self.data[startInd[1], :]
#
# if numIndices > 2:
# data: np.ndarray = np.delete(self.data, startInd, 0)
# for i in range(numIndices-2):
# newIndex: int = self._get_point_furthest_from_other_points(selectedPoints[:i+2, :], data)
# selectedPoints[i+2, :] = data[newIndex, :]
# data = np.delete(data, newIndex, 0)
#
# selectedIndices = []
# assert numIndices == selectedPoints.shape[0]
# for i in range(numIndices):
# newInd = np.where(self.data == selectedPoints[i])[0][0]
# selectedIndices.append(newInd)
#
# assert len(np.unique(selectedIndices)) == len(selectedIndices)
# print('selecting indices time:', np.round(time.time()-t0, 2), 'seconds')
# return selectedIndices
def _get_start_indices(self) -> list:
"""
......
import numpy as np
cimport numpy as np
cimport cython
DTYPE = np.float
ctypedef np.float_t DTYPE_t
ctypedef np.int32_t INT32_t
ctypedef np.int64_t INT64_t
def find_furthest_indices(np.ndarray[DTYPE_t, ndim=2] distMat, int numIndices, int index0, int index1):
cdef int i, j
cdef double dist, minDist, curDist
cdef np.ndarray[INT32_t, ndim=1] selectedIndices = np.empty(numIndices, dtype=np.int32)
cdef np.ndarray[INT32_t, ndim=1] remainingIndices = np.arange(numIndices, dtype=np.int32)
selectedIndices[0] = index0
selectedIndices[1] = index1
for i in range(numIndices-2):
minDist = 0.0
for j in remainingIndices:
dist = np.inf
for k in selectedIndices[:i+1]:
curDist = distMat[j][k]
if curDist < dist:
dist = curDist
if dist > minDist:
minj = j
minDist = dist
selectedIndices[i+2] = minj
remainingIndices = remainingIndices[remainingIndices!=minj]
return selectedIndices
\ No newline at end of file
try:
from setuptools import setup
from setuptools import Extension
except ImportError:
from distutils.core import setup
from distutils.extension import Extension
from Cython.Build import cythonize
import numpy as np
import sys
if len(sys.argv)==1:
sys.argv.append("build_ext")
sys.argv.append("--inplace")
ext = Extension("kennardStone", ["kennardStone.pyx"],
extra_compile_args=['-O3'],)
setup(
name = "kennardStone algorithm",
ext_modules = cythonize([ext], annotate=True), # accepts a glob pattern
include_dirs=[np.get_include()]
)
\ No newline at end of file
......@@ -53,17 +53,13 @@ class TestKennardStone(unittest.TestCase):
self.kennardStone: cmeth.KennardStone = cmeth.KennardStone(np.array([]), 0.1)
def test_get_sampled_indices(self):
numDataSets: int = 400
numDataSets: int = 1000
self.kennardStone.data = np.random.rand(numDataSets, 2)
self.kennardStone.fraction = 0.1
selectedIndices = self.kennardStone.get_sampled_indices()
self.assertEqual(len(selectedIndices), numDataSets*self.kennardStone.fraction)
self.assertEqual(len(np.unique(selectedIndices)), numDataSets*self.kennardStone.fraction)
plt.scatter(self.kennardStone.data[:, 0], self.kennardStone.data[:, 1])
plt.scatter(self.kennardStone.data[selectedIndices, 0], self.kennardStone.data[selectedIndices, 1])
plt.show()
self.kennardStone.fraction = 0.1
numDataSets = 2
self.kennardStone.data = np.random.rand(numDataSets, 2)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment