Commit d7ed0e0f authored by Josef Brandt's avatar Josef Brandt

Merge branch 'master' into Development

parents 68561321 8074c9d8
......@@ -14,3 +14,9 @@ cythonModules/build/
*.pyd
*.html
*.pkl
chemometrics/Assignments.txt
chemometrics/Data.txt
import numpy as np
import cv2
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from scipy import spatial
from itertools import combinations
from random import sample
import sys
sys.path.append("C://Users//xbrjos//Desktop//Python")
from gepard.analysis.particleContainer import ParticleContainer
from gepard.analysis import particleAndMeasurement as pm
from methods import SubsamplingMethod
def get_pca(data: np.ndarray, numComp: int = 2) -> np.ndarray:
    """
    Standardizes the data and returns its first numComp principal components.
    :param data: np.ndarray, rows: features, cols: samples
    :param numComp: number of principal components to return
    :return: np.ndarray of shape (numSamples, numComp)
    """
    try:
        standardizedData = StandardScaler().fit_transform(data.copy())
    except ValueError:
        # print diagnostics, then re-raise (there is no retry)
        print('StandardScaler failed on the input data')
        print('datashape', data.shape)
        print('unique:', np.unique(data))
        raise
    pca = PCA(n_components=numComp)
    # transpose so that rows are samples, as expected by scikit-learn
    princComp: np.ndarray = pca.fit_transform(np.transpose(standardizedData))
return princComp
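# Hedged usage sketch (not part of the original module): feeding a random
# feature matrix (rows: features, cols: samples) through get_pca yields one
# row of principal components per sample.
def _example_get_pca() -> None:
    rng = np.random.default_rng(42)
    features = rng.normal(size=(11, 50))  # 11 features for 50 particles
    princComps = get_pca(features, numComp=2)
    assert princComps.shape == (50, 2)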
def do_DBSCAN_clustering(data: np.ndarray, eps: float = 0.1, min_samples: int = 10) -> tuple:
"""
Does DBSCAN clustering and finds noisy data
:param data: The input array
:param eps:
:param min_samples:
:return: Cluster labels for each point in the dataset given to fit(). Noisy samples are given the label -1.
"""
assert data.shape[1] == 2
standardizedData = StandardScaler().fit_transform(data)
db = DBSCAN(eps=eps, min_samples=min_samples).fit(standardizedData)
return db.labels_, db.core_sample_indices_
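# Hedged usage sketch (illustrative values only): two tight blobs plus uniform
# background points; returned labels mark blob members >= 0 and noise as -1.
def _example_dbscan_clustering() -> None:
    rng = np.random.default_rng(0)
    blob1 = rng.normal(loc=(-2.0, -2.0), scale=0.05, size=(50, 2))
    blob2 = rng.normal(loc=(2.0, 2.0), scale=0.05, size=(50, 2))
    background = rng.uniform(low=-3.0, high=3.0, size=(20, 2))
    data = np.vstack((blob1, blob2, background))
    labels, coreIndices = do_DBSCAN_clustering(data, eps=0.1, min_samples=10)
    assert labels.shape == (120,)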
def get_n_points_closest_to_point(points: np.ndarray, n: int, refPoint: np.ndarray) -> list:
"""
Returns a list with indices of n points that are closest to the indicated refPoint
:param points: np.ndarray, cols: x, y, rows: individual points
:param n: number of points to return
:param refPoint: np.array([x, y]) of reference point
:return: list of point indices
"""
distancesToPoints: np.ndarray = np.linalg.norm(points - refPoint, axis=1)
sortedIndices = np.argsort(distancesToPoints)
return list(sortedIndices[:n])
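# Hedged worked example: with four points and refPoint at the origin, the two
# closest points are index 0 (distance 0.0) and index 3 (distance ~0.71).
def _example_closest_points() -> None:
    points = np.array([[0.0, 0.0], [1.0, 0.0], [5.0, 5.0], [0.5, 0.5]])
    closest = get_n_points_closest_to_point(points, n=2, refPoint=np.array([0.0, 0.0]))
    assert closest == [0, 3]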
class ChemometricSubsampling(SubsamplingMethod):
def __init__(self, particleContainer: ParticleContainer, desiredFraction: float):
super(ChemometricSubsampling, self).__init__(particleContainer, desiredFraction)
@property
def label(self) -> str:
return 'Chemometric Selection'
def apply_subsampling_method(self) -> list:
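        # Pipeline: particle feature matrix -> PCA to two components -> DBSCAN
        # clustering -> per-cluster point selection; the noise cluster receives
        # amplified weight in _get_numPoints_per_cluster.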
vectors: np.ndarray = self._get_particle_featurematrix()
        try:
            princComps: np.ndarray = get_pca(vectors)
        except ValueError:
            print('numParticles:', len(self.particleContainer.particles))
            print('input featurematrix shape', vectors.shape)
            raise  # without re-raising, princComps would be undefined below
clusterLabels, coreIndices = do_DBSCAN_clustering(princComps)
indices: list = self._get_indices_from_clusterLabels(princComps, clusterLabels, coreIndices)
selectedParticles: list = []
for particle in self.particleContainer.particles:
if particle.index in indices:
selectedParticles.append(particle)
return selectedParticles
def _get_particle_featurematrix(self) -> np.ndarray:
"""
:return: np.ndarray, numRows: Features, numCols: Particles
"""
vectors: list = []
for particle in self.particleContainer.particles:
extractor: FeatureExtractor = FeatureExtractor(particle)
vectors.append(extractor.get_characteristic_vector())
vectors: np.ndarray = np.transpose(np.array(vectors))
assert vectors.shape == (11, len(self.particleContainer.particles)), f'wrong featureMat-shape: {vectors.shape}'
return vectors
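    # Each feature vector stacks 7 log-scaled Hu moments and 4 color-hash
    # digits, so N particles yield a matrix of shape (11, N).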
    def equals(self, otherMethod) -> bool:
        return type(otherMethod) is ChemometricSubsampling and otherMethod.fraction == self.fraction
def _get_indices_from_clusterLabels(self, points: np.ndarray, labels: np.ndarray, centerIndices: np.ndarray) -> list:
indices: list = []
allIndices: np.ndarray = np.arange(len(labels))
numPointsPerCluster: dict = self._get_numPoints_per_cluster(labels)
for clusterIndex in set(labels):
indToAppend: list = []
nPoints: int = int(numPointsPerCluster[clusterIndex])
indicesInCluster: np.ndarray = allIndices[labels == clusterIndex]
if clusterIndex == -1:
for ind in sample(list(indicesInCluster), nPoints):
# assert ind not in indices
indices.append(ind)
else:
clusterPoints: np.ndarray = points[indicesInCluster]
centerPoint: np.ndarray = np.mean(clusterPoints, axis=0)
indicesToSelect: list = get_n_points_closest_to_point(clusterPoints, nPoints, centerPoint)
for ind in indicesToSelect:
origInd = indicesInCluster[ind]
indices.append(origInd)
assert len(set(indices)) == len(indices), f'The calculated indices contain duplicates, ' \
f'num duplicates: {len(indices) - len(set(indices))}'
return indices
def _get_numPoints_per_cluster(self, labels: np.ndarray, noiseAmpFactor: float = 5) -> dict:
"""
MP Particles are expected to be the minority of all particles. So, if datapoints were classified as noise
(i.e., label = -1), it is likely that MP is in there. The abundancy of points taken from the noise is multiplied
by the noiseAmpFactor
:param labels:
:param noiseAmpFactor:
:return: A dictionary with keys = cluster index (i.e., label) and value = number of points to take from that
"""
pointsPerCluster: dict = {}
        if not isinstance(labels, np.ndarray):
            labels = np.array(labels)
individualLabels: set = set(labels)
numPointsToSelect = round(len(labels) * self.fraction)
if numPointsToSelect == 0:
numPointsToSelect = 1
numNoisePoints = len(labels[labels == -1])
numClusteredPoints = len(labels) - numNoisePoints
        # cap noiseAmpFactor: beyond 1/fraction, more points would be requested
        # from the noise cluster than it contains
        if noiseAmpFactor > 1/self.fraction:
            noiseAmpFactor = 1/self.fraction
numAmpPoints = numClusteredPoints + numNoisePoints*noiseAmpFactor
fractionPerCluster = np.clip(numPointsToSelect / numAmpPoints, 0.0, 1.0)
tooFewPoints = numPointsToSelect < len(individualLabels)
totalPointsAdded = 0
for ind in individualLabels:
if ind > -1:
if not tooFewPoints:
pointsToAdd = round(fractionPerCluster * len(labels[labels == ind]))
else:
pointsToAdd = 1 if totalPointsAdded < numPointsToSelect else 0
pointsPerCluster[ind] = pointsToAdd
totalPointsAdded += pointsToAdd
# fill up the rest with noisePoints
if numNoisePoints > 0:
            diff: int = int(np.clip(numPointsToSelect - totalPointsAdded, 0, numNoisePoints))
pointsPerCluster[-1] = diff
totalPointsAdded += diff
# just in case too many points were selected (due to rounding errors), keep on deleting until it matches
while totalPointsAdded > numPointsToSelect:
indexWithHighestCount = None
maxCount = 0
            for index in pointsPerCluster:  # iterate over keys, not values
if pointsPerCluster[index] > maxCount:
maxCount = pointsPerCluster[index]
indexWithHighestCount = index
pointsPerCluster[indexWithHighestCount] -= 1
totalPointsAdded -= 1
        assert abs(totalPointsAdded - numPointsToSelect) <= 1, \
            f'selected {totalPointsAdded} points, but {numPointsToSelect} were requested'
for clusterIndex in pointsPerCluster.keys():
assert 0 <= pointsPerCluster[clusterIndex] <= len(labels[labels == clusterIndex])
return pointsPerCluster
class FeatureExtractor(object):
def __init__(self, particle: pm.Particle):
super(FeatureExtractor, self).__init__()
self.particle: pm.Particle = particle
def get_characteristic_vector(self) -> np.ndarray:
log_hu: np.ndarray = self._get_log_hu_moments()
color: np.ndarray = self._get_color_hash(self.particle.color, desiredLength=4)
vector: np.ndarray = np.hstack((log_hu, color))
        assert len(vector) == 7 + 4, f'wrong feature vector: {vector} with shape: {vector.shape}'
return vector
def _get_log_hu_moments(self) -> np.ndarray:
moments: dict = cv2.moments(self.particle.contour)
resultMoments: np.ndarray = np.zeros((7, 1))
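        # Signed log10 scaling: e.g. mom = 1e-3 maps to -1 * 1 * log10(1e-3) = 3,
        # and mom = -1e-3 maps to -3; the sign is kept while the magnitude becomes
        # the (negated) order of magnitude, making the Hu moments comparable.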
for index, mom in enumerate(cv2.HuMoments(moments)):
if mom != 0:
resultMoments[index] = -1 * np.copysign(1.0, mom) * np.log10(abs(mom))
else:
resultMoments[index] = 0
return resultMoments[:, 0]
    def _get_color_hash(self, color: str, desiredLength: int = 4) -> np.ndarray:
        # Caution: str hashes are salted per process, see the note below the class
        colorArray: list = [int(i) for i in str(abs(hash(color)))[:desiredLength]]
        return np.transpose(np.array(colorArray))
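# Hedged sketch (an assumption, not the original implementation): Python's
# builtin hash() for str is salted per process (PYTHONHASHSEED), so
# _get_color_hash is not reproducible across runs. A stable digest-based
# variant could look like this; the helper name is hypothetical.
import hashlib
def _stable_color_hash(color: str, desiredLength: int = 4) -> np.ndarray:
    digest: str = hashlib.md5(color.encode('utf-8')).hexdigest()
    return np.array([int(char, 16) % 10 for char in digest[:desiredLength]])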
This diff is collapsed.
import matplotlib.pyplot as plt
import numpy as np
from random import sample
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import pickle
import time
import sys
sys.path.append("C://Users//xbrjos//Desktop//Python")
from gepard import dataset
from gepard.analysis.particleContainer import ParticleContainer
from input_output import get_pkls_from_directory
from chemometricMethods import get_log_hu_moments, get_color_index, get_pca, get_characteristic_vector
from evaluation import is_MP_particle
def test_classification_models(data: tuple) -> None:
names = ["RandomForestClassifier", "NeuralNetClassifier"]
classifiers = [
RandomForestClassifier(n_estimators=1000),
MLPClassifier(alpha=1, max_iter=1000)]
t0 = time.time()
# preprocess dataset, split into training and test part
    X, y = data
X = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = \
train_test_split(X, y, test_size=.3, random_state=42)
    print(f'preprocessing finished after {round(time.time()-t0, 2)} seconds')
# iterate over classifiers
for name, clf in zip(names, classifiers):
t0 = time.time()
clf.fit(X_train, y_train)
print(f'fitting {name} took {round(time.time()-t0, 2)} seconds')
t0 = time.time()
score = clf.score(X_test, y_test)
with open(f'{name}, score {round(score, 2)}.pkl', "wb") as fp:
pickle.dump(clf, fp, protocol=-1)
y_predicted = clf.predict(X_test)
print(f'finished getting score and prediction after {round(time.time() - t0, 2)} seconds')
errors: dict = {int(k): 0 for k in np.unique(y_test)}
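        # The loop below counts, per true class label (1 = MP, 0 = non-MP),
        # how many test samples were misclassified.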
for j in range(len(y_predicted)):
if y_test[j] != y_predicted[j]:
errors[y_test[j]] += 1
print(f'{name} with test size {len(y_test)} has score {round(score, 2)}, errors: {errors}')
if __name__ == '__main__':
recreateNew: bool = True
if recreateNew:
pklsInFolders: dict = get_pkls_from_directory(r'C:\Users\xbrjos\Desktop\temp MP\NewDatasets')
X: list = []
y: list = []
counter = 0
for folder in pklsInFolders.keys():
for pklPath in pklsInFolders[folder]:
if counter < 100:
dset: dataset.DataSet = dataset.loadData(pklPath)
print('loaded', dset.name)
partContainer: ParticleContainer = dset.particleContainer
for particle in partContainer.particles:
features: np.ndarray = get_characteristic_vector(particle)
# features: list = [abs(i) for i in get_log_hu_moments(particle.contour)]
# features.append(get_color_index(particle.color))
X.append(features)
y.append(int(is_MP_particle(particle)))
counter += 1
X: np.ndarray = np.array(X)
y: np.ndarray = np.array(y)
MPindices: np.ndarray = np.where(y == 1)[0]
nonMPindices: np.ndarray = np.where(y == 0)[0]
nonMPindices: list = sample(list(nonMPindices), len(MPindices))
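        # Random undersampling: the non-MP majority class is reduced to the
        # MP count, so both classes enter training with equal weight.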
X_MP: list = list(X[MPindices])
y_MP: list = list(y[MPindices])
X_nonMP: list = list(X[nonMPindices])
y_nonMP: list = list(y[nonMPindices])
assert set(y_MP) == {1}
assert set(y_nonMP) == {0}
assert len(X_MP) == len(X_nonMP) == len(y_MP) == len(y_nonMP)
X_equalized: np.ndarray = np.array(X_MP + X_nonMP)
y_equalized: np.ndarray = np.array(y_MP + y_nonMP)
dset: tuple = (X_equalized, y_equalized)
with open('particleClassificaion.pkl', "wb") as fp:
pickle.dump(dset, fp, protocol=-1)
else:
with open('particleClassificaion.pkl', "rb") as fp:
dset: tuple = pickle.load(fp)
X, y = dset
# np.savetxt('Data.txt', X)
# np.savetxt('Assignments.txt', y)
# princComps = get_pca(X.transpose(), numComp=2)
#
# plt.scatter(princComps[:, 0], princComps[:, 1])
# print(X_equalized.shape)
# X: np.ndarray = SelectKBest(chi2, k=5).fit_transform(X, y)
# print(X_equalized.shape)
test_classification_models((X, y))
......@@ -18,7 +18,7 @@ from gepard.analysis.particleAndMeasurement import Particle
from helpers import ParticleBinSorter
import methods as meth
import geometricMethods as gmeth
import chemometricMethods as cmeth
from chemometrics import chemometricMethods as cmeth
from datasetOperations import ParticleVariations
......@@ -26,7 +26,7 @@ def get_name_from_directory(dirPath: str) -> str:
return str(os.path.basename(dirPath).split('.')[0])
def get_methods_to_test(dataset: dataset.DataSet, fractions: list = []) -> list:
def get_methods_to_test(dataset: dataset.DataSet, fractions: list = [], maxTries: int = 100) -> list:
"""
    :param fractions: The desired fractions to measure
    :param maxTries: max number of attempts for placing random boxes
:return: list of measurement Objects that are applicable
......@@ -44,8 +44,8 @@ def get_methods_to_test(dataset: dataset.DataSet, fractions: list = []) -> list:
boxCreator: gmeth.BoxSelectionCreator = gmeth.BoxSelectionCreator(dataset)
methods += boxCreator.get_crossBoxSubsamplers_for_fraction(fraction)
methods += boxCreator.get_spiralBoxSubsamplers_for_fraction(fraction)
methods += boxCreator.get_randomBoxSubsamplers_for_fraction(fraction)
methods += boxCreator.get_randomQuarterBoxSubsamplers_for_fraction(fraction)
methods += boxCreator.get_randomBoxSubsamplers_for_fraction(fraction, maxTries=maxTries)
methods += boxCreator.get_randomQuarterBoxSubsamplers_for_fraction(fraction, maxTries=maxTries)
# methods.append(cmeth.ChemometricSubsampling(particleContainer, fraction))
return methods
......
......@@ -169,11 +169,12 @@ class BoxSelectionCreator(object):
return spiralBoxSubsamplers
def get_randomBoxSubsamplers_for_fraction(self, desiredFraction: float) -> list:
def get_randomBoxSubsamplers_for_fraction(self, desiredFraction: float, maxTries: int = 100) -> list:
randomBoxSamplers: list = []
diameter, offset = self._get_diameter_and_offset()
randomBoxSampler: RandomBoxSampling = RandomBoxSampling(None, desiredFraction)
randomBoxSampler.maxTries = maxTries
randomBoxSampler.update_max_fractions()
for numBoxes in randomBoxSampler.possibleBoxNumbers:
randomBoxSampler.numBoxes = numBoxes
......@@ -186,10 +187,11 @@ class BoxSelectionCreator(object):
return randomBoxSamplers
def get_randomQuarterBoxSubsamplers_for_fraction(self, desiredFraction: float) -> list:
def get_randomQuarterBoxSubsamplers_for_fraction(self, desiredFraction: float, maxTries: int = 100) -> list:
randomBoxSamplers: list = []
diameter, offset = self._get_diameter_and_offset()
randomBoxSampler: RandomQuarterBoxes = RandomQuarterBoxes(None, desiredFraction)
randomBoxSampler.maxTries = maxTries
randomBoxSampler.update_max_fractions()
for numBoxes in randomBoxSampler.possibleBoxNumbers:
......
......@@ -12,27 +12,27 @@ SET GEPARD TO EVALUATION BRANCH (WITHOUT THE TILING STUFF), OTHERWISE SOME OF TH
"""
if __name__ == '__main__':
results: TotalResults = TotalResults()
pklsInFolders = get_pkls_from_directory(r'C:\Users\xbrjos\Desktop\temp MP\NewDatasets')
for folder in pklsInFolders.keys():
for samplePath in pklsInFolders[folder]:
newSampleResult: SampleResult = results.add_sample(samplePath)
for attr in get_attributes_from_foldername(folder):
newSampleResult.set_attribute(attr)
t0 = time.time()
results.update_all()
print('updating all took', time.time()-t0, 'seconds')
save_results('results2.res', results)
# results: TotalResults = load_results('results1.res')
# results: TotalResults = TotalResults()
# pklsInFolders = get_pkls_from_directory(r'C:\Users\xbrjos\Desktop\temp MP\NewDatasets')
#
# for folder in pklsInFolders.keys():
# for samplePath in pklsInFolders[folder]:
# newSampleResult: SampleResult = results.add_sample(samplePath)
# for attr in get_attributes_from_foldername(folder):
# newSampleResult.set_attribute(attr)
#
# t0 = time.time()
# results.update_all()
# print('updating all took', time.time()-t0, 'seconds')
#
# save_results('results2_without_rot.res', results)
results: TotalResults = load_results('results2.res')
plot: Figure = get_error_vs_frac_plot(results, attributes=[[]],
methods=[['random subs', 'sizebin', '5 boxes', '15']], standarddevs=False)
# plot: Figure = get_error_vs_frac_plot(results, attributes=[['air', 'water'], ['sediment', 'soil', 'beach', 'slush']],
# methods=[['Boxes random']]*2)
# methods=[['random layout (7', 'random layout (1']]*2)
# methods=[[]]*2)
# methods=[['Random Subsampling', 'Sizebin']] * 2)
plot: Figure = get_error_vs_frac_plot(results,
attributes=[['air', 'water'], ['sediment', 'soil', 'beach', 'slush']],
methods=[['layout (7', 'layout (10', 'layout (15', 'cross', 'random subsampling', 'sizebin']] * 2)
# methods=[['layout (7', 'layout (10', 'layout (15', 'cross', 'random subsampling', 'sizebin']] * 2)
plot.show()
......@@ -27,5 +27,10 @@ def get_default_ParticleContainer() -> ParticleContainer:
x = 10*i
contours.append(np.array([[[x, 0]], [[x+10, 0]], [[x+10, 10]], [[x, 10]]], dtype=np.int32))
particleContainer.setParticleContours(contours)
particleContainer.particles[0].color = 'red'
particleContainer.particles[1].color = 'blue'
particleContainer.particles[2].color = 'green'
particleContainer.particles[3].color = 'transparent'
return particleContainer
This diff is collapsed.
......@@ -254,7 +254,7 @@ class TestSampleResult(unittest.TestCase):
dset.maxdim = minX + imgdim / 2, maxY - imgdim / 2, maxX - imgdim / 2, minY + imgdim / 2
desiredFraction = 0.1
methods = get_methods_to_test(dset, [desiredFraction])
methods = get_methods_to_test(dset, [desiredFraction], maxTries=10)
possibleRandomMethods = 2
possibleCrossBoxMethods = 2
possibleSpiralBoxMethods = 3
......@@ -273,7 +273,7 @@ class TestSampleResult(unittest.TestCase):
self.assertTrue(containsMethod(methods, gmeth.RandomBoxSampling(dset, desiredFraction)))
desiredFraction = 0.5
methods = get_methods_to_test(dset, [desiredFraction])
methods = get_methods_to_test(dset, [desiredFraction], maxTries=10)
possibleRandomMethods = 2
possibleCrossBoxMethods = 1
possibleSpiralBoxMethods = 0
......@@ -290,7 +290,7 @@ class TestSampleResult(unittest.TestCase):
self.assertFalse(containsMethod(methods, gmeth.SpiralBoxSubsampling(dset, desiredFraction)))
desiredFraction = 0.9
methods = get_methods_to_test(dset, [desiredFraction])
methods = get_methods_to_test(dset, [desiredFraction], maxTries=10)
possibleRandomMethods = 2
possibleCrossBoxMethods = 0
possibleSpiralBoxMethods = 0
......@@ -307,7 +307,7 @@ class TestSampleResult(unittest.TestCase):
self.assertFalse(containsMethod(methods, gmeth.SpiralBoxSubsampling(dset, desiredFraction)))
desiredFractions = [0.1, 0.5]
methods = get_methods_to_test(dset, desiredFractions)
methods = get_methods_to_test(dset, desiredFractions, maxTries=10)
possibleRandomMethods = 4
possibleCrossBoxMethods = 3
possibleSpiralBoxMethods = 3
......
......@@ -320,6 +320,7 @@ class TestBoxCreator(unittest.TestCase):
def test_get_randBoxSubsampler_for_Fraction(self):
randBoxSampler: RandomBoxSampling = RandomBoxSampling(None)
randBoxSampler.maxTries = 10
randBoxSampler.update_max_fractions()
maxFracs: dict = randBoxSampler.maxFractions
......@@ -331,7 +332,7 @@ class TestBoxCreator(unittest.TestCase):
numValid += 1
validNumBoxes.append(numBoxes)
            possibleMethods: list = self.boxCreator.get_randomBoxSubsamplers_for_fraction(frac)
            possibleMethods: list = self.boxCreator.get_randomBoxSubsamplers_for_fraction(frac, maxTries=10)
            self.assertEqual(len(possibleMethods), numValid)
            self._assert_correct_partCont(possibleMethods)
            for meth in possibleMethods:
......
......@@ -15,7 +15,7 @@ from gepard.analysis.particleContainer import ParticleContainer
from gepard.analysis.particleAndMeasurement import Particle
from methods import SubsamplingMethod, RandomSampling, SizeBinFractioning
import geometricMethods as gmeth
import chemometricMethods as cmeth
from chemometrics import chemometricMethods as cmeth
from helpers import ParticleBinSorter
......@@ -28,12 +28,13 @@ def get_default_particle_container(numParticles=1000):
class TestAllMethodsGeneric(unittest.TestCase):
allMethodClasses : list = [RandomSampling, SizeBinFractioning,
gmeth.CrossBoxSubSampling, gmeth.SpiralBoxSubsampling,
cmeth.ChemometricSubsampling]
cmeth.TrainedSubsampling]
def setUp(self) -> None:
partContainer: ParticleContainer = get_default_particle_container()
self.methods = []
for methClass in self.allMethodClasses:
self.methods.append(methClass(None, 0.1))
self.methods.append(methClass(partContainer, 0.1))
def test_basic_methods(self):
for method in self.methods:
......