Commit 23105fb5 authored by Josef Brandt's avatar Josef Brandt

Merge branch 'ResultGeneration'

parents f91aed7d a8488af0
......@@ -6,3 +6,11 @@ __pycache__/
*.png
*.res
cythonModules/build/
*.c
*.pyd
*.html
import numpy as np
import cv2
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from scipy import spatial
from itertools import combinations
from random import sample
import sys
sys.path.append("C://Users//xbrjos//Desktop//Python")
from gepard.analysis.particleContainer import ParticleContainer
from gepard.analysis import particleAndMeasurement as pm
from methods import SubsamplingMethod
def get_pca(data: np.ndarray, numComp: int = 2) -> np.ndarray:
try:
standardizedData = StandardScaler().fit_transform(data.copy())
except ValueError:
        print('StandardScaler failed on the input data:')
print('datashape', data.shape)
print('unique:', np.unique(data))
raise
pca = PCA(n_components=numComp)
princComp: np.ndarray = pca.fit_transform(np.transpose(standardizedData))
return princComp
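
# A minimal usage sketch of get_pca on synthetic data (demoData is hypothetical, not part
# of the original module). Note the orientation: as in _get_particle_featurematrix below,
# features are rows and particles are columns; the scores come back one row per particle.
if __name__ == '__main__':
    demoData: np.ndarray = np.random.rand(11, 50)  # 11 features for 50 hypothetical particles
    demoComps: np.ndarray = get_pca(demoData, numComp=2)
    print(demoComps.shape)  # -> (50, 2)
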
def do_DBSCAN_clustering(data: np.ndarray, eps: float = 0.1, min_samples: int = 10) -> tuple:
"""
Does DBSCAN clustering and finds noisy data
:param data: The input array
:param eps:
:param min_samples:
:return: Cluster labels for each point in the dataset given to fit(). Noisy samples are given the label -1.
"""
assert data.shape[1] == 2
standardizedData = StandardScaler().fit_transform(data)
db = DBSCAN(eps=eps, min_samples=min_samples).fit(standardizedData)
return db.labels_, db.core_sample_indices_
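
# A minimal sketch with synthetic data (demoPoints is hypothetical): two tight blobs
# should come out as two clusters; any stragglers get the noise label -1.
if __name__ == '__main__':
    rng = np.random.default_rng(42)
    demoPoints = np.vstack((rng.normal(0.0, 0.05, (50, 2)),
                            rng.normal(1.0, 0.05, (50, 2))))
    demoLabels, demoCoreIndices = do_DBSCAN_clustering(demoPoints, eps=0.1, min_samples=10)
    print(set(demoLabels))  # e.g. {0, 1}, plus -1 if any points end up as noise
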
def get_n_points_closest_to_point(points: np.ndarray, n: int, refPoint: np.ndarray) -> list:
"""
Returns a list with indices of n points that are closest to the indicated refPoint
:param points: np.ndarray, cols: x, y, rows: individual points
:param n: number of points to return
:param refPoint: np.array([x, y]) of reference point
:return: list of point indices
"""
distancesToPoints: np.ndarray = np.linalg.norm(points - refPoint, axis=1)
sortedIndices = np.argsort(distancesToPoints)
return list(sortedIndices[:n])
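
# A minimal sketch (demoPts is hypothetical): the two points closest to the origin
# are the origin itself (index 0) and (0.5, 0.5) (index 3).
if __name__ == '__main__':
    demoPts = np.array([[0.0, 0.0], [1.0, 0.0], [5.0, 5.0], [0.5, 0.5]])
    print(get_n_points_closest_to_point(demoPts, 2, np.array([0.0, 0.0])))  # -> [0, 3]
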
class ChemometricSubsampling(SubsamplingMethod):
def __init__(self, particleContainer: ParticleContainer, desiredFraction: float):
super(ChemometricSubsampling, self).__init__(particleContainer, desiredFraction)
@property
def label(self) -> str:
return 'Chemometric Selection'
def apply_subsampling_method(self) -> list:
vectors: np.ndarray = self._get_particle_featurematrix()
try:
princComps: np.ndarray = get_pca(vectors)
        except ValueError:
            print('numParticles:', len(self.particleContainer.particles))
            print('input featurematrix shape', vectors.shape)
            raise
clusterLabels, coreIndices = do_DBSCAN_clustering(princComps)
indices: list = self._get_indices_from_clusterLabels(princComps, clusterLabels, coreIndices)
selectedParticles: list = []
for particle in self.particleContainer.particles:
if particle.index in indices:
selectedParticles.append(particle)
return selectedParticles
def _get_particle_featurematrix(self) -> np.ndarray:
"""
:return: np.ndarray, numRows: Features, numCols: Particles
"""
vectors: list = []
for particle in self.particleContainer.particles:
extractor: FeatureExtractor = FeatureExtractor(particle)
vectors.append(extractor.get_characteristic_vector())
vectors: np.ndarray = np.transpose(np.array(vectors))
assert vectors.shape == (11, len(self.particleContainer.particles)), f'wrong featureMat-shape: {vectors.shape}'
return vectors
def equals(self, otherMethod) -> bool:
equals: bool = False
if type(otherMethod) == ChemometricSubsampling and otherMethod.fraction == self.fraction:
equals = True
return equals
def _get_indices_from_clusterLabels(self, points: np.ndarray, labels: np.ndarray, centerIndices: np.ndarray) -> list:
indices: list = []
allIndices: np.ndarray = np.arange(len(labels))
numPointsPerCluster: dict = self._get_numPoints_per_cluster(labels)
for clusterIndex in set(labels):
indToAppend: list = []
nPoints: int = int(numPointsPerCluster[clusterIndex])
indicesInCluster: np.ndarray = allIndices[labels == clusterIndex]
if clusterIndex == -1:
for ind in sample(list(indicesInCluster), nPoints):
# assert ind not in indices
indices.append(ind)
else:
clusterPoints: np.ndarray = points[indicesInCluster]
centerPoint: np.ndarray = np.mean(clusterPoints, axis=0)
indicesToSelect: list = get_n_points_closest_to_point(clusterPoints, nPoints, centerPoint)
for ind in indicesToSelect:
origInd = indicesInCluster[ind]
indices.append(origInd)
assert len(set(indices)) == len(indices), f'The calculated indices contain duplicates, ' \
f'num duplicates: {len(indices) - len(set(indices))}'
return indices
def _get_numPoints_per_cluster(self, labels: np.ndarray, noiseAmpFactor: float = 5) -> dict:
"""
MP Particles are expected to be the minority of all particles. So, if datapoints were classified as noise
(i.e., label = -1), it is likely that MP is in there. The abundancy of points taken from the noise is multiplied
by the noiseAmpFactor
:param labels:
:param noiseAmpFactor:
:return: A dictionary with keys = cluster index (i.e., label) and value = number of points to take from that
"""
pointsPerCluster: dict = {}
if type(labels) != np.ndarray:
labels = np.array(labels)
individualLabels: set = set(labels)
numPointsToSelect = round(len(labels) * self.fraction)
if numPointsToSelect == 0:
numPointsToSelect = 1
numNoisePoints = len(labels[labels == -1])
numClusteredPoints = len(labels) - numNoisePoints
        # cap the noiseAmpFactor: noise points cannot be weighted beyond 1/fraction
if noiseAmpFactor > 1/self.fraction:
noiseAmpFactor = 1/self.fraction
numAmpPoints = numClusteredPoints + numNoisePoints*noiseAmpFactor
fractionPerCluster = np.clip(numPointsToSelect / numAmpPoints, 0.0, 1.0)
tooFewPoints = numPointsToSelect < len(individualLabels)
totalPointsAdded = 0
for ind in individualLabels:
if ind > -1:
if not tooFewPoints:
pointsToAdd = round(fractionPerCluster * len(labels[labels == ind]))
else:
pointsToAdd = 1 if totalPointsAdded < numPointsToSelect else 0
pointsPerCluster[ind] = pointsToAdd
totalPointsAdded += pointsToAdd
# fill up the rest with noisePoints
if numNoisePoints > 0:
diff: float = np.clip(numPointsToSelect - totalPointsAdded, 0, numNoisePoints)
pointsPerCluster[-1] = diff
totalPointsAdded += diff
# just in case too many points were selected (due to rounding errors), keep on deleting until it matches
while totalPointsAdded > numPointsToSelect:
indexWithHighestCount = None
maxCount = 0
            for index in pointsPerCluster.keys():
if pointsPerCluster[index] > maxCount:
maxCount = pointsPerCluster[index]
indexWithHighestCount = index
pointsPerCluster[indexWithHighestCount] -= 1
totalPointsAdded -= 1
        if abs(totalPointsAdded - numPointsToSelect) > 1:
            print(f'warning: selected {totalPointsAdded} points, but {numPointsToSelect} were requested')
        # assert abs(totalPointsAdded - numPointsToSelect) <= 1
for clusterIndex in pointsPerCluster.keys():
assert 0 <= pointsPerCluster[clusterIndex] <= len(labels[labels == clusterIndex])
return pointsPerCluster
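
# A worked example (sketch) for _get_numPoints_per_cluster: given 100 labels
# (two clusters of 40 points each, 20 noise points), fraction = 0.1 and the default
# noiseAmpFactor = 5 (the cap 1/fraction = 10 does not bite):
#   numPointsToSelect = 10; numAmpPoints = 80 + 20*5 = 180
#   fractionPerCluster = 10/180, so round(10/180 * 40) = 2 points per cluster
#   the noise fills the rest: clip(10 - 4, 0, 20) = 6 points
# i.e. the noise, where MP is suspected, contributes 6 of the 10 selected points.
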
class FeatureExtractor(object):
def __init__(self, particle: pm.Particle):
super(FeatureExtractor, self).__init__()
self.particle: pm.Particle = particle
def get_characteristic_vector(self) -> np.ndarray:
log_hu: np.ndarray = self._get_log_hu_moments()
color: np.ndarray = self._get_color_hash(self.particle.color, desiredLength=4)
vector: np.ndarray = np.hstack((log_hu, color))
        assert len(vector) == 7 + 4, f'wrong feature vector: {vector} with shape: {vector.shape}'
return vector
def _get_log_hu_moments(self) -> np.ndarray:
moments: dict = cv2.moments(self.particle.contour)
resultMoments: np.ndarray = np.zeros((7, 1))
for index, mom in enumerate(cv2.HuMoments(moments)):
if mom != 0:
resultMoments[index] = -1 * np.copysign(1.0, mom) * np.log10(abs(mom))
else:
resultMoments[index] = 0
return resultMoments[:, 0]
    def _get_color_hash(self, color: str, desiredLength: int = 4) -> np.ndarray:
        # note: Python's built-in hash() of str is salted per interpreter run (PYTHONHASHSEED),
        # so this encoding is only reproducible within a single run
        colorArray: list = [int(i) for i in str(abs(hash(color)))[:desiredLength]]
        return np.transpose(np.array(colorArray))
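
# A minimal sketch exercising FeatureExtractor with a hypothetical stand-in particle
# (_DemoParticle is not part of gepard; constructing a real Particle needs a dataset).
if __name__ == '__main__':
    class _DemoParticle:  # minimal stand-in exposing the two attributes used above
        contour = np.array([[[0, 0]], [[5, 0]], [[5, 3]], [[0, 9]]], dtype=np.int32)
        color = 'blue'
    demoVector = FeatureExtractor(_DemoParticle()).get_characteristic_vector()
    print(demoVector.shape)  # -> (11,): 7 log-Hu moments + 4 color-hash digits
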
import numpy as np
cimport numpy as np
cimport cython
DTYPE = np.float64
ctypedef np.int32_t INT32_t
def rotate_contour_around_point(np.ndarray[INT32_t, ndim=3] contour, np.ndarray[INT32_t, ndim=1] refPoint, double angleDegree):
"""
Rotates a point around another one... All coordinates in pixel space (integers)
:param contour: Array of points to be rotated, [:, 0, 0] = x, [:, 0, 1] = y
:param refPoint: The referemce point around which the first point is rotated, tuple of x and y
:param angleDegree: The angle in degree to rotate (counter-clockwise)
:return: Array of the rotated point, [:, 0, 0] = x, [:, 0, 1] = y
"""
cdef int i
cdef double theta, sin, cos, x, y
cdef np.ndarray[INT32_t, ndim=3] newContour
theta = np.deg2rad(angleDegree)
sin = np.sin(theta)
cos = np.cos(theta)
newContour = np.zeros_like(contour)
for i in range(contour.shape[0]):
x = cos * (contour[i, 0, 0]-refPoint[0]) - sin * (contour[i, 0, 1]-refPoint[1]) + refPoint[0]
y = sin * (contour[i, 0, 0]-refPoint[0]) + cos * (contour[i, 0, 1]-refPoint[1]) + refPoint[1]
newContour[i, 0, 0] = round(x)
newContour[i, 0, 1] = round(y)
return newContour
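
# A minimal sketch (to run after compiling via the setup script below): a 90 degree
# counter-clockwise rotation of the point (10, 0) around the origin should yield (0, 10):
#   import numpy as np
#   from cythonModules import rotateContour
#   contour = np.array([[[10, 0]]], dtype=np.int32)
#   ref = np.array([0, 0], dtype=np.int32)
#   rotateContour.rotate_contour_around_point(contour, ref, 90.0)  # -> [[[0, 10]]]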
from setuptools import setup
from setuptools import Extension
from Cython.Build import cythonize
import numpy as np
import sys
if len(sys.argv) == 1:
sys.argv.append("build_ext")
sys.argv.append("--inplace")
ext = Extension("rotateContour", ["rotateContour.pyx"], extra_compile_args=['-O3'],)
setup(
name="rotate contour around reference point",
ext_modules=cythonize([ext], annotate=True), # accepts a glob pattern
include_dirs=[np.get_include()]
)
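# Running this script with no arguments is equivalent to
#   python setup.py build_ext --inplace
# (the two arguments are appended automatically above). annotate=True additionally
# writes an HTML report next to the generated C file; *.c, *.html and *.pyd are
# covered by the .gitignore additions at the top of this commit.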
import copy
from typing import Iterator
import numpy as np
import sys
sys.path.append("C://Users//xbrjos//Desktop//Python")
from gepard.analysis.particleContainer import ParticleContainer
from cythonModules import rotateContour
class ParticleVariations(object):
def __init__(self, particleContainer: ParticleContainer, numVariations: int = 10) -> None:
super(ParticleVariations, self).__init__()
self.origParticleContainer = particleContainer
self.numVariations = numVariations
    def get_particleContainer_variations(self) -> Iterator[ParticleContainer]:
        if self.numVariations > 0:
            partContainer: ParticleContainer = self.origParticleContainer
            contours: list = partContainer.getParticleContours()
            # rotation center: mean over all contour points of all particles
            allPoints: np.ndarray = np.vstack([cnt.reshape(-1, 2) for cnt in contours])
            center: np.ndarray = np.int32(np.round(np.mean(allPoints, axis=0)))
angles = self._get_angles()
for i in range(self.numVariations):
if i > 0:
partContainer = copy.deepcopy(self.origParticleContainer)
for particle in partContainer.particles:
contour = np.int32(particle.contour)
                    particle.contour = rotateContour.rotate_contour_around_point(contour,
                                                                                 center, float(angles[i]))
yield partContainer
def _get_angles(self) -> np.ndarray:
angleIncrement: float = 360 / self.numVariations
return np.arange(self.numVariations) * angleIncrement
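
# A short sketch of the angle math: with numVariations = 4, _get_angles returns
# np.arange(4) * 90.0 -> [0., 90., 180., 270.], i.e. each method is evaluated on the
# sample in four evenly spaced orientations (the first pass keeps the original rotation).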
......@@ -6,16 +6,18 @@ Created on Wed Jan 22 13:57:28 2020
@author: luna
"""
import pickle
import sys
import os
import numpy as np
import matplotlib.pyplot as plt

sys.path.append("C://Users//xbrjos//Desktop//Python")
from gepard import dataset
from helpers import ParticleBinSorter
import methods as meth
import geometricMethods as gmeth
import chemometricMethods as cmeth
from datasetOperations import ParticleVariations
def get_name_from_directory(dirPath: str) -> str:
......@@ -23,9 +25,11 @@ def get_name_from_directory(dirPath: str) -> str:
class TotalResults(object):
methods: list = [meth.RandomSampling, meth.SizeBinFractioning, gmeth.CrossBoxSubSampling,
gmeth.SpiralBoxSubsampling]
# methods: list = [meth.RandomSampling, meth.SizeBinFractioning, gmeth.CrossBoxSubSampling,
# gmeth.SpiralBoxSubsampling, cmeth.ChemometricSubsampling]
# measuredFractions: list = [0.01, 0.05, 0.1, 0.15, 0.2, 0.5, 0.75, 0.9]
# measuredFractions: list = [0.1, 0.15, 0.2, 0.5, 0.75, 0.9]
measuredFractions: list = [0.1, 0.3, 0.5, 0.9]
def __init__(self):
super(TotalResults, self).__init__()
......@@ -55,11 +59,12 @@ class TotalResults(object):
"""
for index, sample in enumerate(self.sampleResults):
sample.load_dataset()
possibleMethods: list = []
for fraction in self.measuredFractions:
for method in self._get_methods_for_fraction(sample.dataset, fraction):
possibleMethods.append(method)
sample.update_result_with_methods(possibleMethods, force=force)
print(f'processed {index+1} of {len(self.sampleResults)} samples')
def get_error_vs_fraction_data(self, attributes: list = [], methods: list = []) -> dict:
......@@ -103,23 +108,99 @@ class TotalResults(object):
particleContainer = dataset.particleContainer
methods: list = [meth.RandomSampling(particleContainer, fraction),
meth.SizeBinFractioning(particleContainer, fraction)]
boxCreator: gmeth.BoxSelectionCreator = gmeth.BoxSelectionCreator(dataset)
methods += boxCreator.get_crossBoxSubsamplers_for_fraction(fraction)
methods += boxCreator.get_spiralBoxSubsamplers_for_fraction(fraction)
methods.append(cmeth.ChemometricSubsampling(particleContainer, fraction))
return methods
class SubsamplingResult(object):
"""
Stores all interesting results from a subsampling experiment
"""
# TODO: UPDATE PATTERNS -> ARE THESE REASONABLE???
mpPatterns = ['poly', 'rubber', 'pb', 'pr', 'pg', 'py', 'pv']
def __init__(self, subsamplingMethod: meth.SubsamplingMethod):
super(SubsamplingResult, self).__init__()
self.method: meth.SubsamplingMethod = subsamplingMethod
self.mpCountErrors: list = []
# self.origParticleCount: int = None
# self.subSampledParticleCount: int = None
# self.mpCountErrorPerBin: tuple = None
@property
def mpCountError(self) -> float:
error: float = 0.0
if len(self.mpCountErrors) > 0:
error = float(np.mean(self.mpCountErrors))
return error
def reset_results(self) -> None:
"""
Deletes all results
:return:
"""
self.mpCountErrors = []
def add_result(self, origParticles: list, subParticles: list) -> None:
"""
Takes the particle lists from a subsampling method and appends the calculated results.
:param origParticles:
:param subParticles:
:return:
"""
self.mpCountErrors.append(self._get_mp_count_error(origParticles, subParticles, self.method.fraction))
def _get_mp_count_error_per_bin(self, allParticles: list, subParticles: list, fractionMeasured: float) -> tuple:
binSorter = ParticleBinSorter()
allParticlesInBins = binSorter.sort_particles_into_bins(allParticles)
subParticlesInBins = binSorter.sort_particles_into_bins(subParticles)
mpCountErrorsPerBin = []
for allParticleBin, subParticleBin in zip(allParticlesInBins, subParticlesInBins):
mpCountErrorsPerBin.append(self._get_mp_count_error(allParticleBin, subParticleBin, fractionMeasured))
return binSorter.bins, mpCountErrorsPerBin
def _get_mp_count_error(self, allParticles: list, subParticles: list, fractionMeasured: float) -> float:
numMPOrig = self._get_number_of_MP_particles(allParticles)
numMPEstimate = self._get_number_of_MP_particles(subParticles) / fractionMeasured
if numMPOrig != 0:
mpCountError = self._get_error_from_values(numMPOrig, numMPEstimate)
        elif numMPEstimate == 0:
            mpCountError = 0
        else:
            raise ValueError('MP particles found in the subsample, but none in the entire sample. This cannot be!')
return mpCountError
def _get_error_from_values(self, exact: float, estimate: float) -> float:
assert (exact != 0)
return abs(exact - estimate) / exact * 100
def _get_number_of_MP_particles(self, particleList: list) -> int:
numMPParticles = 0
for particle in particleList:
assignment = particle.getParticleAssignment()
for pattern in self.mpPatterns:
if assignment.lower().find(pattern) != -1:
numMPParticles += 1
break
return numMPParticles
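
# A worked example (sketch) of the error metric: if the full sample contains 40 MP
# particles and a subsample at fraction 0.1 contains 5, the extrapolated estimate is
# 5 / 0.1 = 50, giving an mpCountError of |40 - 50| / 40 * 100 = 25 %.
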
class SampleResult(object):
"""
An object the actually stores all generated results per sample and can update and report on them.
"""
def __init__(self, filepath: str, numVariations: int = 10):
super(SampleResult, self).__init__()
self.filepath: str = filepath
self.dataset: dataset.DataSet = None
self.results: list = []
self.attributes: list = []
self.numVariations: int = numVariations # how often the sample is altered for each method
@property
def sampleName(self) -> str:
......@@ -129,25 +210,47 @@ class SampleResult(object):
self.dataset = dataset.loadData(self.filepath)
assert self.dataset is not None
def update_result_with_methods(self, methods: list, force: bool = False) -> list:
"""
Updates result with the given method (contains desiredFraction already)
:param method: The SubsamplingMethod Object
:param force: Wether to force an update. If False, the result is not updated, if it is already present.
:return:
:return: list of updated methods
"""
if self.dataset is None and len(methods) > 0:
self.load_dataset()
updatedMethods: list = []
particleVariations: ParticleVariations = ParticleVariations(self.dataset.particleContainer,
numVariations=self.numVariations)
needsToBeUpdated: dict = {method: False for method in methods}
for index, particleContainer in enumerate(particleVariations.get_particleContainer_variations()):
for method in methods:
                result: SubsamplingResult = self._get_result_of_method(method)
                method.particleContainer = particleContainer
if index == 0:
if result is None:
result = SubsamplingResult(method)
self.results.append(result)
result.reset_results()
needsToBeUpdated[method] = True
elif force:
result.reset_results()
needsToBeUpdated[method] = True
if needsToBeUpdated[method]:
subParticles = method.apply_subsampling_method()
result.add_result(method.particleContainer.particles, subParticles)
if method not in updatedMethods:
updatedMethods.append(method)
print(f'updated {self.sampleName} with {method.label} at fraction {method.fraction}, '
f'iteration {index+1}')
return updatedMethods
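
    # A sketch of the intended call pattern (filename and method arguments hypothetical):
    #   sampleResult = SampleResult('someSample.pkl', numVariations=10)
    #   updated = sampleResult.update_result_with_methods(
    #       [meth.RandomSampling(None, 0.1)], force=False)
    # Each method runs once per particle-container variation; its SubsamplingResult
    # accumulates one mpCountError per variation, and mpCountError reports the mean.
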
def set_attribute(self, newAttribute: str) -> None:
"""
......@@ -157,7 +260,6 @@ class SampleResult(object):
"""
if not self.has_attribute(newAttribute):
self.attributes.append(newAttribute)
def has_any_attribute(self, listOfAttributes: list) -> bool:
hasAttr: bool = False
......@@ -181,85 +283,21 @@ class SampleResult(object):
if method.equals(result.method):
self.results.remove(result)
def _get_result_of_method(self, method: meth.SubsamplingMethod) -> SubsamplingResult:
"""
Checks, if a result with the given method (method type AND measured fraction) is already present.
:param method: The method object, specifying the subsampling method and the measured fraction
:return:
"""
requestedResult: SubsamplingResult = None
for result in self.results:
if method.equals(result.method):
requestedResult = result
break
return requestedResult