chemometricMethods.py 8.34 KB
Newer Older
1 2 3 4 5 6
import numpy as np
import cv2
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy import spatial
from itertools import combinations
7
import time
8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67
import sys
sys.path.append("C://Users//xbrjos//Desktop//Python")
from gepard.analysis.particleContainer import ParticleContainer
from gepard.analysis import particleAndMeasurement as pm
from methods import SubsamplingMethod


def get_pca(data: np.ndarray, numComp: int = 2) -> np.ndarray:
    standardizedData = StandardScaler().fit_transform(data)
    pca = PCA(n_components=numComp)
    princComp: np.ndarray = pca.fit_transform(np.transpose(standardizedData))
    return princComp


class ChemometricSubsampling(SubsamplingMethod):
    def __init__(self, particleContainer: ParticleContainer, desiredFraction: float):
        super(ChemometricSubsampling, self).__init__(particleContainer, desiredFraction)

    def label(self) -> str:
        return 'Chemometric Selection'

    def apply_subsampling_method(self) -> list:
        return []

    def _get_particle_featurematrix(self) -> np.ndarray:
        vectors: list = []
        for particle in self.particleContainer.particles:
            extractor: FeatureExtractor = FeatureExtractor(particle)
            vectors.append(extractor.get_characteristic_vector())
        vectors: np.ndarray = np.transpose(np.array(vectors))
        return vectors


class FeatureExtractor(object):
    def __init__(self, particle: pm.Particle):
        super(FeatureExtractor, self).__init__()
        self.particle: pm.Particle = particle

    def get_characteristic_vector(self) -> np.ndarray:
        return self._get_log_hu_moments()

    def _get_log_hu_moments(self) -> np.ndarray:
        moments: dict = cv2.moments(self.particle.contour)
        resultMoments: np.ndarray = np.zeros((7, 1))
        for index, mom in enumerate(cv2.HuMoments(moments)):
            if mom != 0:
                resultMoments[index] = -1 * np.copysign(1.0, mom) * np.log10(abs(mom))
            else:
                resultMoments[index] = 0

        return resultMoments[:, 0]


class KennardStone(object):
    def __init__(self, data: np.ndarray, desiredFraction: float):
        super(KennardStone, self).__init__()
        self.data: np.ndarray = data
        self.fraction: float = desiredFraction

    def get_sampled_indices(self) -> list:
68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
        t0 = time.time()
        selectedIndices: set = set([])
        numIndices: int = round(self.data.shape[0] * self.fraction)
        if numIndices < 2:
            raise ValueError(f'Not enough indices to generate (min = 2), requested: {numIndices}')
        else:
            distMat = spatial.distance_matrix(self.data, self.data)
            i, j = np.unravel_index(distMat.argmax(), distMat.shape)
            remainingIndices: set = set(np.arange(self.data.shape[0]))
            selectedIndices.add(i)
            selectedIndices.add(j)
            remainingIndices.remove(i)
            remainingIndices.remove(j)
            for _ in range(numIndices-2):
                minDist = 0.0
                for j in remainingIndices:
                    dist = np.min([distMat[j][i] for i in selectedIndices])
                    if dist > minDist:
                        minj = j
                        minDist = dist
                selectedIndices.add(minj)
                remainingIndices.remove(minj)

        assert len(np.unique(list(selectedIndices))) == len(selectedIndices)
        print('selecting indices time:', np.round(time.time() - t0, 2), 'seconds')
        return list(selectedIndices)
    # def get_sampled_indices(self) -> list:
    #     t0 = time.time()
    #     numIndices: int = round(self.data.shape[0] * self.fraction)
    #     if numIndices < 2:
    #         raise ValueError(f'Not enough indices to generate (min = 2), requested: {numIndices}')
    #     else:
    #         startInd = self._get_start_indices()
    #         selectedPoints = np.zeros((numIndices, 2))
    #         selectedPoints[0, :] = self.data[startInd[0], :]
    #         selectedPoints[1, :] = self.data[startInd[1], :]
    #
    #         if numIndices > 2:
    #             data: np.ndarray = np.delete(self.data, startInd, 0)
    #             for i in range(numIndices-2):
    #                 newIndex: int = self._get_point_furthest_from_other_points(selectedPoints[:i+2, :], data)
    #                 selectedPoints[i+2, :] = data[newIndex, :]
    #                 data = np.delete(data, newIndex, 0)
    #
    #     selectedIndices = []
    #     assert numIndices == selectedPoints.shape[0]
    #     for i in range(numIndices):
    #         newInd = np.where(self.data == selectedPoints[i])[0][0]
    #         selectedIndices.append(newInd)
    #
    #     assert len(np.unique(selectedIndices)) == len(selectedIndices)
    #     print('selecting indices time:', np.round(time.time()-t0, 2), 'seconds')
    #     return selectedIndices
121 122 123 124 125 126 127

    def _get_start_indices(self) -> list:
        """
        Finds the indices of the points that are furthest apart.
        Adapted from https://stackoverflow.com/questions/50468643/finding-two-most-far-away-points-in-plot-with-many-points-in-python/50469147
        :return:
        """
128
        assert self.data.shape[1] == 2
129 130 131 132 133
        candidates = self.data[spatial.ConvexHull(self.data).vertices]
        dist_mat = spatial.distance_matrix(candidates, candidates)
        i, j = np.unravel_index(dist_mat.argmax(), dist_mat.shape)
        index1 = np.where(self.data == candidates[i])[0][0]
        index2 = np.where(self.data == candidates[j])[0][0]
134
        assert index1 != index2
135 136
        return sorted([index1, index2])

137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195
    # def _get_point_furthest_from_other_points(self, refPoints: np.ndarray, otherPoints: np.ndarray) -> int:
    #     assert refPoints.shape[1] == 2
    #     assert otherPoints.shape[1] == 2
    #     dist_mat = spatial.distance_matrix(refPoints, otherPoints)
    #     i, index = np.unravel_index(dist_mat.argmax(), dist_mat.shape)
    #     return index


class BoundingAreaHierarchy(object):
    def __init__(self, points: np.ndarray):
        super(BoundingAreaHierarchy, self).__init__()
        self.points: np.ndarray = points
        self.tree: BAHNode = BAHNode(points)
        self._populate_tree()

    # def _populate_tree(self) -> None:
    #


# class BAHNode(object):
#     def __init__(self, parent, startxy: tuple, width: float, height: float, points: np.ndarray) -> None:
#         super(BAHNode, self).__init__()
#         self.maxPointsPerNode: int = 10
#         self.parent: BAHNode = parent
#         self.children: list = []  # if empty, we reached the lowest node level
#         self.points: np.ndarray = np.array([])  # if empty, we are not at the lowest level and have to check children
#         self.x0: float = startxy[0]
#         self.y0: float = startxy[1]
#         self.width: float = width
#         self.height: float = height
#         self.x1: float = self.x0 + width
#         self.y1: float = self.y0 + height
#         self._create_child_nodes(points)
#
#     def _create_child_nodes(self, points:np.ndarray):
#         if points.shape[0] > 0:  # avoid testing for children in case of a testrun (empty array is provided as points)
#             pointsInNode: np.ndarray = self._get_points_in_area(points)
#             if pointsInNode.shape[0] > self.maxPointsPerNode:
#                 childWidth: float = self.width/2
#                 childHeight:float = self.height/2
#
#                 self.children.append(BAHNode(self, (self.x0, self.y1),
#                                              childWidth, childHeight, pointsInNode))
#
#                 self.children.append(BAHNode(self, (self.x0 + childWidth, self.y1),
#                                              childWidth, childHeight, pointsInNode))
#
#                 self.children.append(BAHNode(self, (self.x0, self.y1 + childHeight),
#                                              childWidth, childHeight, pointsInNode))
#
#                 self.children.append(BAHNode(self, (self.x0 + childWidth, self.y1 + childHeight),
#                                              childWidth, childHeight, pointsInNode))
#
#
#     def _get_points_in_area(self, points:np.ndarray) -> np.ndarray:
#         assert points.shape[1] == 2
#         cond1: np.ndarray = np.logical_and(points[:, 0] >= self.x0, points[:, 0] < self.x1)
#         cond2: np.ndarray = np.logical_and(points[:, 1] >= self.y0, points[:, 1] < self.y1)
#         return points[np.logical_and(cond1, cond2)]