Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
What's new
7
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in / Register
Toggle navigation
Open sidebar
Josef Brandt
Subsampling
Commits
5a65802d
Commit
5a65802d
authored
Mar 28, 2020
by
Josef Brandt
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Bugfix in DBSCAN
parent
66624cab
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
58 additions
and
82 deletions
+58
-82
chemometricMethods.py
chemometricMethods.py
+32
-12
evaluation.py
evaluation.py
+2
-3
subsampling.py
subsampling.py
+20
-20
tests/test_chemometricMethods.py
tests/test_chemometricMethods.py
+4
-47
No files found.
chemometricMethods.py
View file @
5a65802d
...
...
@@ -17,10 +17,12 @@ from methods import SubsamplingMethod
def
get_pca
(
data
:
np
.
ndarray
,
numComp
:
int
=
2
)
->
np
.
ndarray
:
try
:
standardizedData
=
StandardScaler
().
fit_transform
(
data
)
standardizedData
=
StandardScaler
().
fit_transform
(
data
.
copy
()
)
except
ValueError
:
print
(
'first standardscaler attempt failed, retrying..'
)
standardizedData
=
StandardScaler
().
fit_transform
(
data
)
print
(
'datashape'
,
data
.
shape
)
print
(
'unique:'
,
np
.
unique
(
data
))
raise
pca
=
PCA
(
n_components
=
numComp
)
princComp
:
np
.
ndarray
=
pca
.
fit_transform
(
np
.
transpose
(
standardizedData
))
return
princComp
...
...
@@ -63,7 +65,11 @@ class ChemometricSubsampling(SubsamplingMethod):
def
apply_subsampling_method
(
self
)
->
list
:
vectors
:
np
.
ndarray
=
self
.
_get_particle_featurematrix
()
princComps
:
np
.
ndarray
=
get_pca
(
vectors
)
try
:
princComps
:
np
.
ndarray
=
get_pca
(
vectors
)
except
ValueError
:
print
(
'numParticles:'
,
len
(
self
.
particleContainer
.
particles
))
print
(
'input featurematrix shape'
,
vectors
.
shape
)
clusterLabels
,
coreIndices
=
do_DBSCAN_clustering
(
princComps
)
indices
:
list
=
self
.
_get_indices_from_clusterLabels
(
princComps
,
clusterLabels
,
coreIndices
)
...
...
@@ -75,11 +81,15 @@ class ChemometricSubsampling(SubsamplingMethod):
return
selectedParticles
def
_get_particle_featurematrix
(
self
)
->
np
.
ndarray
:
"""
:return: np.ndarray, numRows: Features, numCols: Particles
"""
vectors
:
list
=
[]
for
particle
in
self
.
particleContainer
.
particles
:
extractor
:
FeatureExtractor
=
FeatureExtractor
(
particle
)
vectors
.
append
(
extractor
.
get_characteristic_vector
())
vectors
:
np
.
ndarray
=
np
.
transpose
(
np
.
array
(
vectors
))
assert
vectors
.
shape
==
(
11
,
len
(
self
.
particleContainer
.
particles
)),
f
'wrong featureMat-shape:
{
vectors
.
shape
}
'
return
vectors
def
equals
(
self
,
otherMethod
)
->
bool
:
...
...
@@ -98,15 +108,19 @@ class ChemometricSubsampling(SubsamplingMethod):
nPoints
:
int
=
int
(
numPointsPerCluster
[
clusterIndex
])
indicesInCluster
:
np
.
ndarray
=
allIndices
[
labels
==
clusterIndex
]
if
clusterIndex
==
-
1
:
indToAppend
=
sample
(
list
(
indicesInCluster
),
nPoints
)
for
ind
in
sample
(
list
(
indicesInCluster
),
nPoints
):
# assert ind not in indices
indices
.
append
(
ind
)
else
:
clusterPoints
:
np
.
ndarray
=
points
[
indicesInCluster
]
centerPoint
:
np
.
ndarray
=
np
.
mean
(
clusterPoints
,
axis
=
0
)
ind
ToAppend
=
get_n_points_closest_to_point
(
clusterPoints
,
nPoints
,
centerPoint
)
for
i
nd
in
ind
ToAppe
nd
:
indices
.
append
(
i
nd
)
ind
icesToSelect
:
list
=
get_n_points_closest_to_point
(
clusterPoints
,
nPoints
,
centerPoint
)
for
ind
in
indicesToSelect
:
origI
nd
=
ind
icesInCluster
[
i
nd
]
indices
.
append
(
origI
nd
)
assert
len
(
set
(
indices
))
==
len
(
indices
),
f
'The calculated indices contain duplicates, '
\
f
'num duplicates:
{
len
(
indices
)
-
len
(
set
(
indices
))
}
'
return
indices
def
_get_numPoints_per_cluster
(
self
,
labels
:
np
.
ndarray
,
noiseAmpFactor
:
float
=
5
)
->
dict
:
...
...
@@ -169,7 +183,9 @@ class ChemometricSubsampling(SubsamplingMethod):
pointsPerCluster
[
indexWithHighestCount
]
-=
1
totalPointsAdded
-=
1
assert
abs
(
totalPointsAdded
-
numPointsToSelect
)
<=
1
if
not
abs
(
totalPointsAdded
-
numPointsToSelect
)
<=
1
:
print
(
'error'
)
# assert abs(totalPointsAdded - numPointsToSelect) <= 1
for
clusterIndex
in
pointsPerCluster
.
keys
():
assert
0
<=
pointsPerCluster
[
clusterIndex
]
<=
len
(
labels
[
labels
==
clusterIndex
])
return
pointsPerCluster
...
...
@@ -182,8 +198,12 @@ class FeatureExtractor(object):
def
get_characteristic_vector
(
self
)
->
np
.
ndarray
:
log_hu
:
np
.
ndarray
=
self
.
_get_log_hu_moments
()
color
:
np
.
ndarray
=
self
.
_get_color_hash
(
self
.
particle
.
color
)
return
np
.
transpose
(
np
.
hstack
((
log_hu
,
color
)))
color
:
np
.
ndarray
=
self
.
_get_color_hash
(
self
.
particle
.
color
,
desiredLength
=
4
)
vector
:
np
.
ndarray
=
np
.
hstack
((
log_hu
,
color
))
if
len
(
vector
)
!=
11
:
print
(
'error'
)
assert
len
(
vector
)
==
7
+
4
,
f
'wrong feature vector:
{
vector
}
with shape:
{
vector
.
shape
}
'
return
vector
def
_get_log_hu_moments
(
self
)
->
np
.
ndarray
:
moments
:
dict
=
cv2
.
moments
(
self
.
particle
.
contour
)
...
...
@@ -197,5 +217,5 @@ class FeatureExtractor(object):
return
resultMoments
[:,
0
]
def
_get_color_hash
(
self
,
color
:
str
,
desiredLength
:
int
=
4
)
->
np
.
ndarray
:
colorArray
:
list
=
[
int
(
i
)
for
i
in
str
(
abs
(
hash
(
color
))
%
(
10
**
desiredLength
))
]
colorArray
:
list
=
[
int
(
i
)
for
i
in
str
(
abs
(
hash
(
color
))
)[:
desiredLength
]
]
return
np
.
transpose
(
np
.
array
(
colorArray
))
evaluation.py
View file @
5a65802d
...
...
@@ -26,9 +26,9 @@ def get_name_from_directory(dirPath: str) -> str:
class
TotalResults
(
object
):
# methods: list = [meth.RandomSampling, meth.SizeBinFractioning, gmeth.CrossBoxSubSampling,
# gmeth.SpiralBoxSubsampling, cmeth.ChemometricSubsampling]
#
measuredFractions: list = [0.01, 0.05, 0.1, 0.15, 0.2, 0.5, 0.75, 0.9]
measuredFractions
:
list
=
[
0.01
,
0.05
,
0.1
,
0.15
,
0.2
,
0.5
,
0.75
,
0.9
]
# measuredFractions: list = [0.1, 0.15, 0.2, 0.5, 0.75, 0.9]
measuredFractions
:
list
=
[
0.1
,
0.3
,
0.5
,
0.7
,
0.9
]
#
measuredFractions: list = [0.1, 0.3, 0.5, 0.9]
def
__init__
(
self
):
super
(
TotalResults
,
self
).
__init__
()
...
...
@@ -106,7 +106,6 @@ class TotalResults(object):
particleContainer
=
dataset
.
particleContainer
methods
:
list
=
[
meth
.
RandomSampling
(
particleContainer
,
fraction
),
meth
.
SizeBinFractioning
(
particleContainer
,
fraction
)]
boxCreator
:
gmeth
.
BoxSelectionCreator
=
gmeth
.
BoxSelectionCreator
(
dataset
)
methods
+=
boxCreator
.
get_crossBoxSubsamplers_for_fraction
(
fraction
)
methods
+=
boxCreator
.
get_spiralBoxSubsamplers_for_fraction
(
fraction
)
...
...
subsampling.py
View file @
5a65802d
...
...
@@ -10,23 +10,23 @@ SET GEPARD TO EVALUATION BRANCH (WITHOUT THE TILING STUFF), OTHERWISE SOME OF TH
"""
# results: TotalResults = TotalResults()
# pklsInFolders = get_pkls_from_directory(r'C:\Users\xbrjos\Desktop\temp MP\NewDatasets')
#
# for folder in pklsInFolders.keys():
# for samplePath in pklsInFolders[folder]:
# newSampleResult: SampleResult = results.add_sample(samplePath)
# for attr in get_attributes_from_foldername(folder):
# newSampleResult.set_attribute(attr)
#
# t0 = time.time()
# results.update_all()
# print('updating all took', time.time()-t0, 'seconds')
#
# save_results('results1.res', results)
results
:
TotalResults
=
load_results
(
'results1.res'
)
results
.
update_all
(
force
=
True
)
results
:
TotalResults
=
TotalResults
()
pklsInFolders
=
get_pkls_from_directory
(
r
'C:\Users\xbrjos\Desktop\temp MP\NewDatasets'
)
for
folder
in
pklsInFolders
.
keys
():
for
samplePath
in
pklsInFolders
[
folder
]:
newSampleResult
:
SampleResult
=
results
.
add_sample
(
samplePath
)
for
attr
in
get_attributes_from_foldername
(
folder
):
newSampleResult
.
set_attribute
(
attr
)
t0
=
time
.
time
()
results
.
update_all
()
print
(
'updating all took'
,
time
.
time
()
-
t0
,
'seconds'
)
save_results
(
'results1.res'
,
results
)
# results: TotalResults = load_results('results1.res')
# results.update_all(force=True)
# save_results('results1.res', results)
plt
.
clf
()
errorPerFraction
:
dict
=
results
.
get_error_vs_fraction_data
(
attributes
=
[
'air'
,
'water'
],
...
...
@@ -39,11 +39,11 @@ for methodLabel in errorPerFraction.keys():
plt
.
plot
(
fractions
,
errors
)
plt
.
scatter
(
fractions
,
errors
,
label
=
methodLabel
)
plt
.
title
(
'
Spiral or Box Layouts on
Air/Water sample'
,
fontSize
=
15
)
plt
.
title
(
'Air/Water sample'
,
fontSize
=
15
)
plt
.
xscale
(
'log'
)
plt
.
xlabel
(
'measured fraction'
,
fontsize
=
12
)
plt
.
ylabel
(
'mpCountError (%)'
,
fontsize
=
12
)
#
plt.ylim([0, 1])
plt
.
ylim
([
0
,
1
00
])
plt
.
legend
()
errorPerFraction
:
dict
=
results
.
get_error_vs_fraction_data
(
attributes
=
[
'sediment'
,
'soil'
,
'beach'
,
'slush'
],
...
...
@@ -55,11 +55,11 @@ for methodLabel in errorPerFraction.keys():
plt
.
plot
(
fractions
,
errors
)
plt
.
scatter
(
fractions
,
errors
,
label
=
methodLabel
)
plt
.
title
(
'
Spiral or Box Layouts on
Sedim
a
nt/Beach/Slush sample'
,
fontSize
=
15
)
plt
.
title
(
'Sedim
e
nt/Beach/Slush sample'
,
fontSize
=
15
)
plt
.
xscale
(
'log'
)
plt
.
xlabel
(
'measured fraction'
,
fontsize
=
12
)
plt
.
ylabel
(
'mpCountError (%)'
,
fontsize
=
12
)
#
plt.ylim([0, 1])
plt
.
ylim
([
0
,
1
00
])
plt
.
legend
()
plt
.
show
()
...
...
tests/test_chemometricMethods.py
View file @
5a65802d
...
...
@@ -51,11 +51,12 @@ class TestFeatureExtractor(unittest.TestCase):
self
.
assertFalse
(
np
.
any
(
diff
>
0.1
))
def
test_get_color_hash
(
self
):
for
color
in
[
'red'
,
'green'
,
'violet'
,
'blue'
,
'Blue'
]:
for
color
in
[
'red'
,
'green'
,
'violet'
,
'blue'
,
'Blue'
,
'non-determinable'
,
None
]:
for
numDigits
in
[
4
,
6
,
8
]:
hashNumber
:
int
=
abs
(
hash
(
color
))
%
(
10
**
numDigits
)
hashNumber
:
int
=
abs
(
hash
(
color
))
hashArray
:
np
.
ndarray
=
self
.
extractor
.
_get_color_hash
(
color
,
numDigits
)
for
i
in
range
(
hashArray
.
shape
[
0
]):
self
.
assertEqual
(
len
(
hashArray
),
numDigits
)
for
i
in
range
(
numDigits
):
self
.
assertEqual
(
hashArray
[
i
],
int
(
str
(
hashNumber
)[
i
]))
...
...
@@ -79,39 +80,6 @@ class TestChemometricSubsampling(unittest.TestCase):
diff
:
np
.
ndarray
=
features
[
i
,
:]
-
np
.
mean
(
features
[
i
,
:])
self
.
assertFalse
(
np
.
any
(
diff
>
0.1
))
# def test_get_indices_from_clusterLabels(self):
# numClusters: int = 3
# numPointsPerCluster: int = 50
# numNoisePoints: int = 10
#
# # the random_state=1 guarantees same outcome always
# centers: list = [i[0] for i in np.random.rand(numClusters, 1, 2)]
# points, labels = make_blobs(numClusters*numPointsPerCluster, centers=centers, cluster_std=0.3,
# shuffle=False, random_state=1)
# centerIndices: list = [int(round(numPointsPerCluster / 2 + (i*numPointsPerCluster))) for i in range(numClusters)]
#
# noisePoints: np.ndarray = np.random.rand(numNoisePoints, 2)
# noiseLabels: np.ndarray = np.array([-1] * numNoisePoints)
# points = np.vstack((points, noisePoints))
# labels = np.hstack((labels, noiseLabels))
#
# origFraction: float = self.chemSubs.fraction
# numIndicesTotal: float = round(len(labels)) * origFraction
# fractionPerCluster: float = origFraction * (numIndicesTotal - numNoisePoints) / numIndicesTotal
# pointsPerCluster: float = round(numPointsPerCluster * fractionPerCluster)
#
# self.assertEqual(numIndicesTotal, numNoisePoints + numClusters * pointsPerCluster)
#
# # Conversion from list to np.ndarray is to make the below indexing work.
# indices: np.ndarray = np.array(self.chemSubs._get_indices_from_clusterLabels(points, labels, np.array(centerIndices)))
# # Here we only check the correct number of indices, but not if the correct ones
# # (close to the center indices) were selected. However, all noise indices shall be there!
# # self.assertEqual(len(indices), round(len(labels) * self.chemSubs.fraction))
# # self.assertEqual(len(indices[indices < 100]), round(100 * fractionPerCluster)) # Cluster 0
# # self.assertEqual(len(indices[np.logical_and(indices >= 100, indices < 200)]), round(100 * fractionPerCluster)) # Cluster 1
# # self.assertEqual(len(indices[indices >= 200]), round(100 * fractionPerCluster)) # Cluster 2
# # self.assertEqual(len(indices[indices >= 300]), numNoisePoints) # Noise Cluster
def
test_get_numPoints_per_cluster
(
self
):
def
get_orig_points_per_cluster
(
index
):
return
(
index
+
1
)
*
50
...
...
@@ -153,7 +121,6 @@ class TestChemometricSubsampling(unittest.TestCase):
tooFewPoints
=
numPointsToMeasure
<
(
numClusters
+
(
1
if
numNoisePoints
>
0
else
0
))
pointsFound
:
int
=
0
roundingErrorFound
:
bool
=
False
for
clusterIndex
in
pointsPerCluster
.
keys
():
if
clusterIndex
>
-
1
:
if
not
tooFewPoints
:
...
...
@@ -161,18 +128,8 @@ class TestChemometricSubsampling(unittest.TestCase):
if
pointsExpected
==
0
:
pointsExpected
=
1
# if pointsPerCluster[clusterIndex] != pointsExpected:
# print('error')
# if not roundingErrorFound:
diff
=
abs
(
pointsPerCluster
[
clusterIndex
]
-
pointsExpected
)
if
diff
>
1
:
print
(
'argh'
)
self
.
assertTrue
(
diff
<=
1
)
# if diff != 0:
# roundingErrorFound = True
# else:
# self.assertEqual(pointsPerCluster[clusterIndex], pointsExpected)
else
:
if
pointsFound
<
numPointsToMeasure
:
self
.
assertEqual
(
pointsPerCluster
[
clusterIndex
],
1
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment