Commit 7ca2dd08 authored by Josef Brandt

First Stage Trained Subsampling

parent ec57b131
......@@ -14,7 +14,9 @@ import sys
sys.path.append("C://Users//xbrjos//Desktop//Python")
from gepard.analysis.particleContainer import ParticleContainer
from gepard.analysis import particleAndMeasurement as pm
from gepard.analysis import particleCharacterization as pc
from methods import SubsamplingMethod
from helpers import timingDecorator
......@@ -95,9 +97,9 @@ def get_solidity(contour: np.ndarray) -> float:
hull: np.ndarray = cv2.convexHull(contour)
hull_area: float = cv2.contourArea(hull)
if area == 0 or hull_area == 0:
raise ValueError
solidity: float = area / hull_area
solidity: float = 0
else:
solidity: float = area / hull_area
return solidity
......@@ -112,10 +114,11 @@ def get_aspect_ratio(contour: np.ndarray) -> float:
if short > long:
long, short = short, long
if short == 0.0:
raise InvalidParticleError
aspectRatio: float = 1.0
if short > 0.0:
aspectRatio = long/short
return long/short
return aspectRatio
def get_extent(contour: np.ndarray) -> float:
......@@ -163,11 +166,42 @@ class TrainedSubsampling(SubsamplingMethod):
self.score: float = None
self.clf = None
self.clfPath: str = path
self.fraction = desiredFraction
# @property
# def fraction(self) -> float:
# return self.desiredFraction/2
def equals(self, otherMethod) -> bool:
    """
    Compares this method with another subsampling method.
    :param otherMethod: the method object to compare against
    :return: True only if otherMethod is exactly a TrainedSubsampling with the same
             fraction and score, and both refer to the very same classifier object.
    """
    sameKind: bool = type(otherMethod) == TrainedSubsampling
    if not (sameKind and otherMethod.fraction == self.fraction):
        return False
    # the classifier is compared by identity, not by value
    return bool(otherMethod.score == self.score and otherMethod.clf is self.clf)
@property
def label(self) -> str:
    """
    :return: the fixed, human readable name of this subsampling method
    """
    methodLabel: str = 'Trained Random Sampling'
    return methodLabel
def get_maximum_achievable_fraction(self) -> float:
    """
    :return: the constant 1.0, independent of any instance state
    """
    maxFraction: float = 1.0
    return maxFraction
def apply_subsampling_method(self) -> list:
    """
    Loads the classifier, lets it predict on the feature matrix of the particle container
    and hands the predictions to _get_measure_indices to obtain the indices to select.
    :return: list of the container's particles whose index is among the selected indices,
             in the order in which they appear in the particle container
    """
    self._load_classifier()
    featureMatrix: np.ndarray = get_particle_featurematrix(self.particleContainer)
    predictedLabels: list = list(self.clf.predict(featureMatrix))
    wantedIndices: set = self._get_measure_indices(predictedLabels)
    return [particle for particle in self.particleContainer.particles
            if particle.index in wantedIndices]
# def _make_subparticles_match_fraction(self, subParticles: list) -> list:
# return subParticles
def _load_classifier(self) -> None:
assert os.path.exists(self.clfPath)
fname: str = self.clfPath
......@@ -195,6 +229,18 @@ class TrainedSubsampling(SubsamplingMethod):
return indicesToMeasure
def get_theoretic_frac(self) -> float:
    """
    The theoretical fraction that also takes the scoring of the trained model into account.
    It is used for extrapolating the mpCount of the subsampled particle list.
    With a score of 1.0 the result is 1.0, with a score of 0.5 it equals self.fraction.
    :return: 1 / (1 + (1 - score)/0.5 * (1/fraction - 1))
    """
    # span between a 50 % score (result == fraction) and a 100 % score (result == 1)
    span: float = 1/self.fraction - 1
    penalty: float = (1 - self.score)/0.5 * span
    return 1 / (1 + penalty)
# return self.fraction
# class ChemometricSubsampling(SubsamplingMethod):
# # def __init__(self, particleContainer: ParticleContainer, desiredFraction: float):
......
......@@ -58,7 +58,7 @@ def test_classification_models(dataset: tuple) -> None:
if __name__ == '__main__':
recreateNew: bool = True
recreateNew: bool = False
if recreateNew:
pklsInFolders: dict = get_pkls_from_directory(r'C:\Users\xbrjos\Desktop\temp MP\NewDatasets')
......@@ -110,6 +110,12 @@ if __name__ == '__main__':
dset: tuple = pickle.load(fp)
X, y = dset
with open(r'C:\Users\xbrjos\Desktop\Python\Subsampling\chemometrics\RandomForestClassifier, score 0.72.pkl', "rb") as fp:
clf: RandomForestClassifier = pickle.load(fp)
y_predicted = clf.predict(X)
# np.savetxt('Data.txt', X)
# np.savetxt('Assignments.txt', y)
# princComps = get_pca(X.transpose(), numComp=2)
......@@ -121,4 +127,4 @@ if __name__ == '__main__':
# print(X_equalized.shape)
test_classification_models((X, y))
# test_classification_models((X, y))
......@@ -32,8 +32,8 @@ def get_methods_to_test(dataset: dataset.DataSet, fractions: list = [], maxTries
:return: list of measurement Objects that are applicable
"""
if len(fractions) == 0:
fractions: list = [0.02, 0.05, 0.1, 0.25, 0.5, 0.7, 0.9]
# fractions: list = [0.02, 0.1, 0.5, 0.9]
# fractions: list = [0.02, 0.05, 0.1, 0.25, 0.5, 0.7, 0.9]
fractions: list = [0.1, 0.3, 0.5]
methods: list = []
particleContainer = dataset.particleContainer
......@@ -46,6 +46,7 @@ def get_methods_to_test(dataset: dataset.DataSet, fractions: list = [], maxTries
methods += boxCreator.get_spiralBoxSubsamplers_for_fraction(fraction)
methods += boxCreator.get_randomBoxSubsamplers_for_fraction(fraction, maxTries=maxTries)
methods += boxCreator.get_randomQuarterBoxSubsamplers_for_fraction(fraction, maxTries=maxTries)
methods.append(cmeth.TrainedSubsampling(particleContainer, fraction))
# methods.append(cmeth.ChemometricSubsampling(particleContainer, fraction))
return methods
......@@ -91,27 +92,34 @@ class TotalResults(object):
return newResult
def update_all(self, force: bool = False, multiprocessing: bool = True) -> None:
    """
    Updates all samples with all methods and all fractions
    :param force: Whether to force an update of an already existing method.
    :param multiprocessing: Whether to spawn multiple processes for computation
    :return:
    """
    numSamples: int = len(self.sampleResults)
    if multiprocessing:
        forceList: list = [force]*numSamples
        indices: list = list(np.arange(numSamples))
        numWorkers: int = 4  # in case of quadcore processor that seems reasonable??
        # We want to have slightly more chunks than workers. executor.map rejects a
        # chunksize below 1, which the rounding would yield for very few samples.
        chunksize: int = max(1, int(round(numSamples / numWorkers * 0.7)))
        print(f'multiprocessing with {numSamples} samples and chunksize of {chunksize}')

        with concurrent.futures.ProcessPoolExecutor() as executor:
            results = executor.map(update_sample, self.sampleResults, forceList, indices, chunksize=chunksize)

        # results are yielded in input order, so the enumeration index matches the sample index
        for index, res in enumerate(results):
            updatedSample, processid = res
            print(f'returned from process {processid}, iteration index {index}')
            self.sampleResults[index] = updatedSample
    else:
        for index, sampleResult in enumerate(self.sampleResults):
            # honour the caller's force flag, exactly as the multiprocessing branch does
            updatedResult, _ = update_sample(sampleResult, force, index)
            self.sampleResults[index] = updatedResult
            print(f'done updating {updatedResult.dataset.name} at index {index}')
def get_error_vs_fraction_data(self, attributes: list = [], methods: list = []) -> dict:
"""
......@@ -200,7 +208,12 @@ class SubsamplingResult(object):
:param subParticles:
:return:
"""
error: float = self._get_mp_count_error(origParticles, subParticles, self.method.fraction)
if type(self.method) == cmeth.TrainedSubsampling:
fraction = self.method.get_theoretic_frac()
else:
fraction = self.method.fraction
error: float = self._get_mp_count_error(origParticles, subParticles, fraction)
self.origParticleCount = len(origParticles)
self.mpCountErrors.append(error)
......
......@@ -2,7 +2,7 @@ import os
import pickle
from evaluation import TotalResults
from helpers import timingDecorator
from chemometrics.chemometricMethods import TrainedSubsampling
def load_results(fname: str) -> TotalResults:
res: TotalResults = None
......@@ -17,6 +17,10 @@ def save_results(fname: str, result: TotalResults) -> None:
for sampleRes in result.sampleResults:
storedDsets[sampleRes.sampleName] = sampleRes.dataset
sampleRes.dataset = None
for subsamplingRes in sampleRes.results:
subsamplingRes.method.particleContainer = None
if type(subsamplingRes.method) == TrainedSubsampling:
subsamplingRes.method.clf = None
with open(fname, "wb") as fp:
pickle.dump(result, fp, protocol=-1)
......
......@@ -18,6 +18,15 @@ class SubsamplingMethod(object):
self.particleContainer = particleConatainer
self.fraction: float = desiredFraction
# @property
# def fraction(self) -> float:
# """
# The TrainedSubsampling, e.g., changes its fraction depending on the quality of its training.
# All "regular" methods just return the desired Fraction.
# :return:
# """
# return self.desiredFraction
@property
def label(self) -> str:
"""
......
......@@ -12,24 +12,26 @@ SET GEPARD TO EVALUATION BRANCH (WITHOUT THE TILING STUFF), OTHERWISE SOME OF TH
"""
if __name__ == '__main__':
# results: TotalResults = TotalResults()
# pklsInFolders = get_pkls_from_directory(r'C:\Users\xbrjos\Desktop\temp MP\NewDatasets')
#
# for folder in pklsInFolders.keys():
# for samplePath in pklsInFolders[folder]:
# newSampleResult: SampleResult = results.add_sample(samplePath)
# for attr in get_attributes_from_foldername(folder):
# newSampleResult.set_attribute(attr)
#
# t0 = time.time()
# results.update_all()
# print('updating all took', time.time()-t0, 'seconds')
#
# save_results('results2_without_rot.res', results)
results: TotalResults = load_results('results2.res')
results: TotalResults = TotalResults()
pklsInFolders = get_pkls_from_directory(r'C:\Users\xbrjos\Desktop\temp MP\NewDatasets')
counter = 0
for folder in pklsInFolders.keys():
for samplePath in pklsInFolders[folder]:
if counter < 10:
newSampleResult: SampleResult = results.add_sample(samplePath)
for attr in get_attributes_from_foldername(folder):
newSampleResult.set_attribute(attr)
counter += 1
t0 = time.time()
results.update_all(multiprocessing=False)
print('updating all took', time.time()-t0, 'seconds')
save_results('results_test.res', results)
# results: TotalResults = load_results('results2.res')
plot: Figure = get_error_vs_frac_plot(results, attributes=[[]],
methods=[['random subs', 'sizebin', '5 boxes', '15']], standarddevs=False)
methods=[['random', 'trained']], standarddevs=True)
# plot: Figure = get_error_vs_frac_plot(results, attributes=[['air', 'water'], ['sediment', 'soil', 'beach', 'slush']],
# methods=[['random layout (7', 'random layout (1']]*2)
# methods=[[]]*2)
......
import numpy as np
import random
import sys
sys.path.append("C://Users//xbrjos//Desktop//Python")
import gepard
from gepard.dataset import DataSet
from gepard.analysis.particleContainer import ParticleContainer
from gepard.analysis.particleAndMeasurement import Particle, Measurement
def setMaxDim(dataset: DataSet, imgSize: float, minX: float, maxX: float, minY: float, maxY: float) -> None:
......@@ -34,3 +36,39 @@ def get_default_ParticleContainer() -> ParticleContainer:
return particleContainer
def get_MP_particles(numParticles) -> list:
    """
    :param numParticles: how many particles to create
    :return: list with numParticles particles, each obtained from get_MP_particle()
    """
    return [get_MP_particle() for _ in range(numParticles)]
def get_non_MP_particles(numParticles) -> list:
    """
    :param numParticles: how many particles to create
    :return: list with numParticles particles, each obtained from get_non_MP_particle()
    """
    return [get_non_MP_particle() for _ in range(numParticles)]
def get_MP_particle() -> Particle:
    """
    Creates a Particle carrying a single Measurement whose assignment is one of the
    names in polymerNames. The name is drawn with a fixed seed, so every call yields
    the same assignment.
    :return: the newly created particle
    """
    polymerNames = ['Poly (methyl methacrylate',
                    'Polyethylene',
                    'Silicone rubber',
                    'PB15',
                    'PY13',
                    'PR20']
    # A private generator with the fixed seed gives the same draw as seeding the module,
    # but leaves the global state of `random` untouched for all other code.
    rng = random.Random(15203018)
    polymName = rng.sample(polymerNames, 1)[0]
    newParticle: Particle = Particle()
    newMeas = Measurement()
    newMeas.setAssignment(polymName)
    newParticle.addMeasurement(newMeas)
    return newParticle
def get_non_MP_particle() -> Particle:
    """
    Creates a Particle carrying a single default-constructed Measurement
    (no assignment is set on it here).
    :return: the newly created particle
    """
    particle: Particle = Particle()
    emptyMeasurement = Measurement()
    particle.addMeasurement(emptyMeasurement)
    return particle
......@@ -16,7 +16,9 @@ from gepard.analysis.particleContainer import ParticleContainer
from gepard import dataset
from chemometrics import chemometricMethods as cmeth
from helpers_for_test import get_default_ParticleContainer
from helpers_for_test import get_default_ParticleContainer, get_non_MP_particles, get_MP_particles
from evaluation import SubsamplingResult
class TestParticleFeatures(unittest.TestCase):
def test_get_contour_moments(self):
......@@ -93,8 +95,6 @@ class TestTrainedSubsampling(unittest.TestCase):
self.assertEqual(self.trainedSampling.score, 0.7)
def test_get_measure_indices(self):
import time
t0 = time.time()
for mpFrac in [0.001, 0.01, 0.05]:
for numMPParticles in [1, 10, 100]:
numNonMP: int = int(numMPParticles * 1/mpFrac) - numMPParticles
......@@ -114,6 +114,40 @@ class TestTrainedSubsampling(unittest.TestCase):
for index in range(numMPParticles): # all MP Particles should be measured
self.assertTrue(index in indicesToMeasure)
def test_get_theoretic_fraction(self):
    """
    For every combination of fraction and score, get_theoretic_frac() has to return
    exactly 1 / (1 + (1 - score)/0.5 * (1/fraction - 1)).
    """
    sampler = self.trainedSampling
    for frac in [0.1, 0.3, 0.5, 0.9, 1.0]:
        for score in [0.5, 0.7, 1.0]:
            sampler.fraction = frac
            sampler.score = score

            # recompute the expectation from the values as stored on the method object
            storedScore: float = sampler.score
            span: float = 1 / sampler.fraction - 1  # i.e., from 50 % score to 100 % score
            expected: float = 1/(1 + (1 - storedScore) / 0.5 * span)
            self.assertEqual(sampler.get_theoretic_frac(), expected)
# def test_make_subparticles_match_fraction(self):
# self.trainedSampling.desiredFraction = 0.5
# result: SubsamplingResult = SubsamplingResult(self.trainedSampling)
#
# allParticles: list = get_MP_particles(10) + get_non_MP_particles(990)
# subParticles: list = get_MP_particles(10) + get_non_MP_particles(490) # half of particles but ALL mp particles
# self.trainedSampling.particleContainer.particles = allParticles + subParticles
#
# self.trainedSampling.score = 1.0 # i.e., perfect prediction
# # modSubParticles: list = self.trainedSampling._make_subparticles_match_fraction(subParticles)
# result.add_result(subParticles, allParticles)
# self.assertEqual(result.mpCountError, 0)
#
# self.trainedSampling.score = 0.5 # i.e., completely random, no prediction quality
# # modSubParticles: list = self.trainedSampling._make_subparticles_match_fraction(subParticles)
# result.add_result(subParticles, allParticles)
# self.assertEqual(result.mpCountError, 100)
# class TestChemometricSubsampling(unittest.TestCase):
# def setUp(self) -> None:
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment