Commit 5a65802d authored by Josef Brandt

Bugfix in DBSCAN

parent 66624cab
@@ -17,10 +17,12 @@ from methods import SubsamplingMethod
 def get_pca(data: np.ndarray, numComp: int = 2) -> np.ndarray:
     try:
-        standardizedData = StandardScaler().fit_transform(data)
+        standardizedData = StandardScaler().fit_transform(data.copy())
     except ValueError:
-        print('first standardscaler attempt failed, retrying..')
-        standardizedData = StandardScaler().fit_transform(data)
+        print('datashape', data.shape)
+        print('unique:', np.unique(data))
+        raise
     pca = PCA(n_components=numComp)
     princComp: np.ndarray = pca.fit_transform(np.transpose(standardizedData))
     return princComp
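For context on the new except branch: sklearn's StandardScaler raises a ValueError when the input contains NaNs or infs, so printing the shape and the unique values of the offending matrix before re-raising makes the bad input easy to spot. A minimal sketch of that failure mode (the data here is made up):

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

data = np.array([[1.0, 2.0, np.nan],
                 [4.0, 5.0, 6.0]])
try:
    StandardScaler().fit_transform(data)
except ValueError as exc:
    print('datashape', data.shape)     # (2, 3)
    print('unique:', np.unique(data))  # [ 1.  2.  4.  5.  6. nan]
    print(exc)                         # "Input contains NaN ..."
```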
@@ -63,7 +65,11 @@ class ChemometricSubsampling(SubsamplingMethod):
     def apply_subsampling_method(self) -> list:
         vectors: np.ndarray = self._get_particle_featurematrix()
-        princComps: np.ndarray = get_pca(vectors)
+        try:
+            princComps: np.ndarray = get_pca(vectors)
+        except ValueError:
+            print('numParticles:', len(self.particleContainer.particles))
+            print('input featurematrix shape', vectors.shape)
         clusterLabels, coreIndices = do_DBSCAN_clustering(princComps)
         indices: list = self._get_indices_from_clusterLabels(princComps, clusterLabels, coreIndices)
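do_DBSCAN_clustering itself is not shown in this diff; assuming it wraps sklearn's DBSCAN, the labels it returns use -1 for noise points, which is why the index-selection code below treats clusterIndex == -1 separately. A small sketch:

```python
import numpy as np
from sklearn.cluster import DBSCAN

points = np.vstack((np.random.rand(50, 2),        # one dense blob
                    np.random.rand(5, 2) * 100))  # a few far-away outliers
db = DBSCAN(eps=0.3, min_samples=5).fit(points)
print(np.unique(db.labels_))         # e.g. [-1  0]: -1 marks noise, 0.. are clusters
print(db.core_sample_indices_[:5])   # indices of core points
```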
@@ -75,11 +81,15 @@ class ChemometricSubsampling(SubsamplingMethod):
         return selectedParticles

     def _get_particle_featurematrix(self) -> np.ndarray:
+        """
+        :return: np.ndarray, numRows: Features, numCols: Particles
+        """
         vectors: list = []
         for particle in self.particleContainer.particles:
             extractor: FeatureExtractor = FeatureExtractor(particle)
             vectors.append(extractor.get_characteristic_vector())
         vectors: np.ndarray = np.transpose(np.array(vectors))
+        assert vectors.shape == (11, len(self.particleContainer.particles)), f'wrong featureMat-shape: {vectors.shape}'
         return vectors

     def equals(self, otherMethod) -> bool:
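The new docstring and assert pin down the orientation of the feature matrix: one column per particle, eleven feature rows (seven log-Hu moments plus four color-hash digits, per the FeatureExtractor changes further down). A sketch with stand-in vectors:

```python
import numpy as np

numParticles = 5
# stand-ins for FeatureExtractor.get_characteristic_vector(), each of length 11
vectors = [np.random.rand(11) for _ in range(numParticles)]
featureMat = np.transpose(np.array(vectors))   # rows: features, cols: particles
assert featureMat.shape == (11, numParticles)
```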
@@ -98,15 +108,19 @@ class ChemometricSubsampling(SubsamplingMethod):
             nPoints: int = int(numPointsPerCluster[clusterIndex])
             indicesInCluster: np.ndarray = allIndices[labels == clusterIndex]
             if clusterIndex == -1:
-                indToAppend = sample(list(indicesInCluster), nPoints)
+                for ind in sample(list(indicesInCluster), nPoints):
+                    # assert ind not in indices
+                    indices.append(ind)
             else:
                 clusterPoints: np.ndarray = points[indicesInCluster]
                 centerPoint: np.ndarray = np.mean(clusterPoints, axis=0)
-                indToAppend = get_n_points_closest_to_point(clusterPoints, nPoints, centerPoint)
-                for ind in indToAppend:
-                    indices.append(ind)
+                indicesToSelect: list = get_n_points_closest_to_point(clusterPoints, nPoints, centerPoint)
+                for ind in indicesToSelect:
+                    origInd = indicesInCluster[ind]
+                    indices.append(origInd)
         assert len(set(indices)) == len(indices), f'The calculated indices contain duplicates, ' \
                                                   f'num duplicates: {len(indices) - len(set(indices))}'
         return indices

     def _get_numPoints_per_cluster(self, labels: np.ndarray, noiseAmpFactor: float = 5) -> dict:
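This hunk is the core of the bugfix: get_n_points_closest_to_point apparently returns positions relative to clusterPoints, so appending those values directly mixes up local and global indices, and different clusters can contribute the same numbers (hence the duplicate check). Mapping each local index back through indicesInCluster restores the position in the full point array. A sketch with made-up numbers:

```python
import numpy as np

allIndices = np.arange(10)
labels = np.array([0, 0, 1, 1, 1, 0, 1, 0, 1, 1])
points = np.random.rand(10, 2)

clusterIndex = 1
indicesInCluster = allIndices[labels == clusterIndex]   # [2 3 4 6 8 9]
clusterPoints = points[indicesInCluster]

localIndices = [0, 2]   # what a closest-to-center helper would return, relative to clusterPoints
globalIndices = [int(indicesInCluster[i]) for i in localIndices]
print(globalIndices)    # [2, 4] -> valid row indices of the full `points` array
```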
@@ -169,7 +183,9 @@ class ChemometricSubsampling(SubsamplingMethod):
                 pointsPerCluster[indexWithHighestCount] -= 1
                 totalPointsAdded -= 1
-        assert abs(totalPointsAdded - numPointsToSelect) <= 1
+        if not abs(totalPointsAdded - numPointsToSelect) <= 1:
+            print('error')
+        # assert abs(totalPointsAdded - numPointsToSelect) <= 1
         for clusterIndex in pointsPerCluster.keys():
            assert 0 <= pointsPerCluster[clusterIndex] <= len(labels[labels == clusterIndex])
         return pointsPerCluster
@@ -182,8 +198,12 @@ class FeatureExtractor(object):
     def get_characteristic_vector(self) -> np.ndarray:
         log_hu: np.ndarray = self._get_log_hu_moments()
-        color: np.ndarray = self._get_color_hash(self.particle.color)
-        return np.transpose(np.hstack((log_hu, color)))
+        color: np.ndarray = self._get_color_hash(self.particle.color, desiredLength=4)
+        vector: np.ndarray = np.hstack((log_hu, color))
+        if len(vector) != 11:
+            print('error')
+        assert len(vector) == 7 + 4, f'wrong feature vector: {vector} with shape: {vector.shape}'
+        return vector

     def _get_log_hu_moments(self) -> np.ndarray:
         moments: dict = cv2.moments(self.particle.contour)
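_get_log_hu_moments is only partially visible here; the assert above relies on it returning exactly seven values, which matches OpenCV's Hu moments. A standard-usage sketch (the exact log scaling in the repository may differ):

```python
import cv2
import numpy as np

contour = np.array([[[0, 0]], [[40, 0]], [[40, 20]], [[0, 20]]], dtype=np.int32)
moments: dict = cv2.moments(contour)
hu: np.ndarray = cv2.HuMoments(moments)               # shape (7, 1)
log_hu = -np.sign(hu) * np.log10(np.abs(hu) + 1e-30)  # log scale, keep the sign
print(log_hu[:, 0].shape)                             # (7,) -> seven shape features
```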
@@ -197,5 +217,5 @@ class FeatureExtractor(object):
         return resultMoments[:, 0]

     def _get_color_hash(self, color: str, desiredLength: int = 4) -> np.ndarray:
-        colorArray: list = [int(i) for i in str(abs(hash(color)) % (10**desiredLength))]
+        colorArray: list = [int(i) for i in str(abs(hash(color)))[:desiredLength]]
         return np.transpose(np.array(colorArray))
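The _get_color_hash change addresses the length problem directly: abs(hash(color)) % 10**desiredLength can produce a remainder with fewer than desiredLength digits, so the color part of the feature vector (and the 11-element assert above) occasionally broke; slicing the first desiredLength digits of the hash string always yields the requested length, provided the hash has that many digits. Illustration with a stand-in value; note that Python salts str hashes per interpreter run, so these digits are only reproducible within one process:

```python
value = 987600123          # stand-in for abs(hash(color))
desiredLength = 4

oldWay = [int(i) for i in str(value % (10 ** desiredLength))]  # 123 -> [1, 2, 3]: only 3 digits
newWay = [int(i) for i in str(value)[:desiredLength]]          # '9876' -> [9, 8, 7, 6]: always 4
print(len(oldWay), len(newWay))                                # 3 4
```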
@@ -26,9 +26,9 @@ def get_name_from_directory(dirPath: str) -> str:
 class TotalResults(object):
     # methods: list = [meth.RandomSampling, meth.SizeBinFractioning, gmeth.CrossBoxSubSampling,
     #                  gmeth.SpiralBoxSubsampling, cmeth.ChemometricSubsampling]
     # measuredFractions: list = [0.01, 0.05, 0.1, 0.15, 0.2, 0.5, 0.75, 0.9]
-    measuredFractions: list = [0.01, 0.05, 0.1, 0.15, 0.2, 0.5, 0.75, 0.9]
-    # measuredFractions: list = [0.1, 0.15, 0.2, 0.5, 0.75, 0.9]
+    measuredFractions: list = [0.1, 0.3, 0.5, 0.7, 0.9]
+    # measuredFractions: list = [0.1, 0.3, 0.5, 0.9]

     def __init__(self):
         super(TotalResults, self).__init__()
@@ -106,7 +106,6 @@ class TotalResults(object):
         particleContainer = dataset.particleContainer
         methods: list = [meth.RandomSampling(particleContainer, fraction),
                          meth.SizeBinFractioning(particleContainer, fraction)]
         boxCreator: gmeth.BoxSelectionCreator = gmeth.BoxSelectionCreator(dataset)
         methods += boxCreator.get_crossBoxSubsamplers_for_fraction(fraction)
         methods += boxCreator.get_spiralBoxSubsamplers_for_fraction(fraction)
@@ -10,23 +10,23 @@ SET GEPARD TO EVALUATION BRANCH (WITHOUT THE TILING STUFF), OTHERWISE SOME OF TH
 """

-# results: TotalResults = TotalResults()
-# pklsInFolders = get_pkls_from_directory(r'C:\Users\xbrjos\Desktop\temp MP\NewDatasets')
-#
-# for folder in pklsInFolders.keys():
-# for samplePath in pklsInFolders[folder]:
-# newSampleResult: SampleResult = results.add_sample(samplePath)
-# for attr in get_attributes_from_foldername(folder):
-# newSampleResult.set_attribute(attr)
-#
-# t0 = time.time()
-# results.update_all()
-# print('updating all took', time.time()-t0, 'seconds')
-#
-# save_results('results1.res', results)
-results: TotalResults = load_results('results1.res')
-results.update_all(force=True)
+results: TotalResults = TotalResults()
+pklsInFolders = get_pkls_from_directory(r'C:\Users\xbrjos\Desktop\temp MP\NewDatasets')
+
+for folder in pklsInFolders.keys():
+    for samplePath in pklsInFolders[folder]:
+        newSampleResult: SampleResult = results.add_sample(samplePath)
+        for attr in get_attributes_from_foldername(folder):
+            newSampleResult.set_attribute(attr)
+
+t0 = time.time()
+results.update_all()
+print('updating all took', time.time()-t0, 'seconds')
+
+save_results('results1.res', results)
+
+# results: TotalResults = load_results('results1.res')
+# results.update_all(force=True)
+# save_results('results1.res', results)

 plt.clf()
 errorPerFraction: dict = results.get_error_vs_fraction_data(attributes=['air', 'water'],
@@ -39,11 +39,11 @@ for methodLabel in errorPerFraction.keys():
     plt.plot(fractions, errors)
     plt.scatter(fractions, errors, label=methodLabel)

-plt.title('Spiral or Box Layouts on Air/Water sample', fontSize=15)
+plt.title('Air/Water sample', fontSize=15)
 plt.xscale('log')
 plt.xlabel('measured fraction', fontsize=12)
 plt.ylabel('mpCountError (%)', fontsize=12)
-# plt.ylim([0, 1])
+plt.ylim([0, 100])
 plt.legend()

 errorPerFraction: dict = results.get_error_vs_fraction_data(attributes=['sediment', 'soil', 'beach', 'slush'],
@@ -55,11 +55,11 @@ for methodLabel in errorPerFraction.keys():
     plt.plot(fractions, errors)
     plt.scatter(fractions, errors, label=methodLabel)

-plt.title('Spiral or Box Layouts on Sedimant/Beach/Slush sample', fontSize=15)
+plt.title('Sediment/Beach/Slush sample', fontSize=15)
 plt.xscale('log')
 plt.xlabel('measured fraction', fontsize=12)
 plt.ylabel('mpCountError (%)', fontsize=12)
-# plt.ylim([0, 1])
+plt.ylim([0, 100])
 plt.legend()
 plt.show()
@@ -51,11 +51,12 @@ class TestFeatureExtractor(unittest.TestCase):
         self.assertFalse(np.any(diff > 0.1))

     def test_get_color_hash(self):
-        for color in ['red', 'green', 'violet', 'blue', 'Blue']:
+        for color in ['red', 'green', 'violet', 'blue', 'Blue', 'non-determinable', None]:
             for numDigits in [4, 6, 8]:
-                hashNumber: int = abs(hash(color)) % (10**numDigits)
+                hashNumber: int = abs(hash(color))
                 hashArray: np.ndarray = self.extractor._get_color_hash(color, numDigits)
-                for i in range(hashArray.shape[0]):
+                self.assertEqual(len(hashArray), numDigits)
+                for i in range(numDigits):
                     self.assertEqual(hashArray[i], int(str(hashNumber)[i]))
@@ -79,39 +80,6 @@ class TestChemometricSubsampling(unittest.TestCase):
             diff: np.ndarray = features[i, :] - np.mean(features[i, :])
             self.assertFalse(np.any(diff > 0.1))

-# def test_get_indices_from_clusterLabels(self):
-# numClusters: int = 3
-# numPointsPerCluster: int = 50
-# numNoisePoints: int = 10
-#
-# # the random_state=1 guarantees same outcome always
-# centers: list = [i[0] for i in np.random.rand(numClusters, 1, 2)]
-# points, labels = make_blobs(numClusters*numPointsPerCluster, centers=centers, cluster_std=0.3,
-# shuffle=False, random_state=1)
-# centerIndices: list = [int(round(numPointsPerCluster / 2 + (i*numPointsPerCluster))) for i in range(numClusters)]
-#
-# noisePoints: np.ndarray = np.random.rand(numNoisePoints, 2)
-# noiseLabels: np.ndarray = np.array([-1] * numNoisePoints)
-# points = np.vstack((points, noisePoints))
-# labels = np.hstack((labels, noiseLabels))
-#
-# origFraction: float = self.chemSubs.fraction
-# numIndicesTotal: float = round(len(labels)) * origFraction
-# fractionPerCluster: float = origFraction * (numIndicesTotal - numNoisePoints) / numIndicesTotal
-# pointsPerCluster: float = round(numPointsPerCluster * fractionPerCluster)
-#
-# self.assertEqual(numIndicesTotal, numNoisePoints + numClusters * pointsPerCluster)
-#
-# # Conversion from list to np.ndarray is to make the below indexing work.
-# indices: np.ndarray = np.array(self.chemSubs._get_indices_from_clusterLabels(points, labels, np.array(centerIndices)))
-# # Here we only check the correct number of indices, but not if the correct ones
-# # (close to the center indices) were selected. However, all noise indices shall be there!
-# # self.assertEqual(len(indices), round(len(labels) * self.chemSubs.fraction))
-# # self.assertEqual(len(indices[indices < 100]), round(100 * fractionPerCluster)) # Cluster 0
-# # self.assertEqual(len(indices[np.logical_and(indices >= 100, indices < 200)]), round(100 * fractionPerCluster)) # Cluster 1
-# # self.assertEqual(len(indices[indices >= 200]), round(100 * fractionPerCluster)) # Cluster 2
-# # self.assertEqual(len(indices[indices >= 300]), numNoisePoints) # Noise Cluster

     def test_get_numPoints_per_cluster(self):
         def get_orig_points_per_cluster(index):
             return (index+1)*50
@@ -153,7 +121,6 @@ class TestChemometricSubsampling(unittest.TestCase):
         tooFewPoints = numPointsToMeasure < (numClusters + (1 if numNoisePoints > 0 else 0))
         pointsFound: int = 0
-        roundingErrorFound: bool = False
         for clusterIndex in pointsPerCluster.keys():
             if clusterIndex > -1:
                 if not tooFewPoints:
@@ -161,18 +128,8 @@ class TestChemometricSubsampling(unittest.TestCase):
                     if pointsExpected == 0:
                         pointsExpected = 1
-# if pointsPerCluster[clusterIndex] != pointsExpected:
-# print('error')
-# if not roundingErrorFound:
                     diff = abs(pointsPerCluster[clusterIndex] - pointsExpected)
-                    if diff > 1:
-                        print('argh')
                     self.assertTrue(diff <= 1)
-# if diff != 0:
-# roundingErrorFound = True
-# else:
-# self.assertEqual(pointsPerCluster[clusterIndex], pointsExpected)
             else:
                 if pointsFound < numPointsToMeasure:
                     self.assertEqual(pointsPerCluster[clusterIndex], 1)