Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 17 additions & 8 deletions optbinning/binning/binning.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,8 @@ class OptimalBinning(BaseOptimalBinning):
The maximum number of bins after pre-binning (prebins).

min_prebin_size : float (default=0.05)
The fraction of mininum number of records for each prebin.
The fraction of minimum number of records for each prebin
(including missing and ``special_code`` groups).

min_n_bins : int or None, optional (default=None)
The minimum number of bins. If None, then ``min_n_bins`` is
Expand All @@ -301,11 +302,13 @@ class OptimalBinning(BaseOptimalBinning):
a value in ``[0, max_n_prebins]``.

min_bin_size : float or None, optional (default=None)
The fraction of minimum number of records for each bin. If None,
The fraction of minimum number of records for each bin
(including missing and ``special_code`` groups). If None,
``min_bin_size = min_prebin_size``.

max_bin_size : float or None, optional (default=None)
The fraction of maximum number of records for each bin. If None,
The fraction of maximum number of records for each bin
(including missing and ``special_code`` groups). If None,
``max_bin_size = 1.0``.

min_bin_n_nonevent : int or None, optional (default=None)
Expand Down Expand Up @@ -516,6 +519,7 @@ def __init__(self, name="", dtype="numerical", prebinning_method="cart",
self._n_prebins = None
self._n_refinements = 0
self._n_samples = None
self._n_samples_weighted = None
self._optimizer = None
self._solution = None
self._splits_optimal = None
Expand Down Expand Up @@ -711,10 +715,15 @@ def _fit(self, x, y, sample_weight, check_input):
logger.info("Pre-processing started.")

self._n_samples = len(x)
self._n_samples_weighted = sum(sample_weight) if sample_weight is not None else len(x)

if self.verbose:
logger.info("Pre-processing: number of samples: {}"
.format(self._n_samples))
if self._n_samples == self._n_samples_weighted:
logger.info("Pre-processing: number of samples: {}"
.format(self._n_samples))
else:
logger.info("Pre-processing: number of samples: {}. Weighted samples: {}"
.format(self._n_samples, self._n_samples_weighted))

time_preprocessing = time.perf_counter()

Expand Down Expand Up @@ -880,7 +889,7 @@ def _fit_prebinning(self, x, y, y_missing, x_special, y_special, y_others,
class_weight=None, sw_clean=None, sw_missing=None,
sw_special=None, sw_others=None):

min_bin_size = int(np.ceil(self.min_prebin_size * self._n_samples))
min_bin_size = int(np.ceil(self.min_prebin_size * self._n_samples_weighted))

prebinning = PreBinning(method=self.prebinning_method,
n_bins=self.max_n_prebins,
Expand Down Expand Up @@ -916,12 +925,12 @@ def _fit_optimizer(self, splits, n_nonevent, n_event):

# Min/max number of bins
if self.min_bin_size is not None:
min_bin_size = int(np.ceil(self.min_bin_size * self._n_samples))
min_bin_size = int(np.ceil(self.min_bin_size * self._n_samples_weighted))
else:
min_bin_size = self.min_bin_size

if self.max_bin_size is not None:
max_bin_size = int(np.ceil(self.max_bin_size * self._n_samples))
max_bin_size = int(np.ceil(self.max_bin_size * self._n_samples_weighted))
else:
max_bin_size = self.max_bin_size

Expand Down
9 changes: 6 additions & 3 deletions optbinning/binning/binning_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,8 @@ class BinningProcess(Base, BaseEstimator, BaseBinningProcess):
The maximum number of bins after pre-binning (prebins).

min_prebin_size : float (default=0.05)
The fraction of mininum number of records for each prebin.
The fraction of minimum number of records for each prebin
(including missing and ``special_code`` groups).

min_n_bins : int or None, optional (default=None)
The minimum number of bins. If None, then ``min_n_bins`` is
Expand All @@ -459,11 +460,13 @@ class BinningProcess(Base, BaseEstimator, BaseBinningProcess):
a value in ``[0, max_n_prebins]``.

min_bin_size : float or None, optional (default=None)
The fraction of minimum number of records for each bin. If None,
The fraction of minimum number of records for each bin
(including missing and ``special_code`` groups). If None,
``min_bin_size = min_prebin_size``.

max_bin_size : float or None, optional (default=None)
The fraction of maximum number of records for each bin. If None,
The fraction of maximum number of records for each bin
(including missing and ``special_code`` groups). If None,
``max_bin_size = 1.0``.

max_pvalue : float or None, optional (default=None)
Expand Down
23 changes: 16 additions & 7 deletions optbinning/binning/continuous_binning.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,8 @@ class ContinuousOptimalBinning(OptimalBinning):
The maximum number of bins after pre-binning (prebins).

min_prebin_size : float (default=0.05)
The fraction of mininum number of records for each prebin.
The fraction of minimum number of records for each prebin
(including missing and ``special_code`` groups).

min_n_bins : int or None, optional (default=None)
The minimum number of bins. If None, then ``min_n_bins`` is
Expand All @@ -219,11 +220,13 @@ class ContinuousOptimalBinning(OptimalBinning):
a value in ``[0, max_n_prebins]``.

min_bin_size : float or None, optional (default=None)
The fraction of minimum number of records for each bin. If None,
The fraction of minimum number of records for each bin
(including missing and ``special_code`` groups). If None,
``min_bin_size = min_prebin_size``.

max_bin_size : float or None, optional (default=None)
The fraction of maximum number of records for each bin. If None,
The fraction of maximum number of records for each bin
(including missing and ``special_code`` groups). If None,
``max_bin_size = 1.0``.

monotonic_trend : str or None, optional (default="auto")
Expand Down Expand Up @@ -400,6 +403,7 @@ def __init__(self, name="", dtype="numerical", prebinning_method="cart",
self._n_prebins = None
self._n_refinements = 0
self._n_samples = None
self._n_samples_weighted = None
self._optimizer = None
self._splits_optimal = None
self._status = None
Expand Down Expand Up @@ -559,10 +563,15 @@ def _fit(self, x, y, sample_weight, check_input):
logger.info("Pre-processing started.")

self._n_samples = len(x)
self._n_samples_weighted = sum(sample_weight) if sample_weight is not None else len(x)

if self.verbose:
logger.info("Pre-processing: number of samples: {}"
.format(self._n_samples))
if self._n_samples == self._n_samples_weighted:
logger.info("Pre-processing: number of samples: {}"
.format(self._n_samples))
else:
logger.info("Pre-processing: number of samples: {}. Weighted samples: {}"
.format(self._n_samples, self._n_samples_weighted))

time_preprocessing = time.perf_counter()

Expand Down Expand Up @@ -757,12 +766,12 @@ def _fit_optimizer(self, splits, n_records, sums, ssums, stds):
return

if self.min_bin_size is not None:
min_bin_size = int(np.ceil(self.min_bin_size * self._n_samples))
min_bin_size = int(np.ceil(self.min_bin_size * self._n_samples_weighted))
else:
min_bin_size = self.min_bin_size

if self.max_bin_size is not None:
max_bin_size = int(np.ceil(self.max_bin_size * self._n_samples))
max_bin_size = int(np.ceil(self.max_bin_size * self._n_samples_weighted))
else:
max_bin_size = self.max_bin_size

Expand Down
11 changes: 8 additions & 3 deletions optbinning/binning/multiclass_binning.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,8 @@ class MulticlassOptimalBinning(OptimalBinning):
The maximum number of bins after pre-binning (prebins).

min_prebin_size : float (default=0.05)
The fraction of mininum number of records for each prebin.
The fraction of minimum number of records for each prebin
(including missing and ``special_code`` groups).

min_n_bins : int or None, optional (default=None)
The minimum number of bins. If None, then ``min_n_bins`` is
Expand All @@ -225,11 +226,13 @@ class MulticlassOptimalBinning(OptimalBinning):
a value in ``[0, max_n_prebins]``.

min_bin_size : float or None, optional (default=None)
The fraction of minimum number of records for each bin. If None,
The fraction of minimum number of records for each bin
(including missing and ``special_code`` groups). If None,
``min_bin_size = min_prebin_size``.

max_bin_size : float or None, optional (default=None)
The fraction of maximum number of records for each bin. If None,
The fraction of maximum number of records for each bin
(including missing and ``special_code`` groups). If None,
``max_bin_size = 1.0``.

monotonic_trend : str, array-like or None, optional (default="auto")
Expand Down Expand Up @@ -360,6 +363,7 @@ def __init__(self, name="", prebinning_method="cart", solver="cp",
self._n_prebins = None
self._n_refinements = 0
self._n_samples = None
self._n_samples_weighted = None
self._optimizer = None
self._splits_optimal = None
self._status = None
Expand Down Expand Up @@ -504,6 +508,7 @@ def _fit(self, x, y, check_input):
logger.info("Pre-processing started.")

self._n_samples = len(x)
self._n_samples_weighted = self._n_samples

if self.verbose:
logger.info("Pre-processing: number of samples: {}"
Expand Down
19 changes: 16 additions & 3 deletions optbinning/binning/prebinning.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,22 @@ def fit(self, x, y, sample_weight=None):
self._splits = est.bin_edges_[0][1:-1]

elif self.method == "cart":
cart_kwargs = {
"min_samples_leaf": self.min_bin_size,
"max_leaf_nodes": self.n_bins}
if sample_weight is None:
cart_kwargs = {
"min_samples_leaf": self.min_bin_size,
"max_leaf_nodes": self.n_bins}
else:
# https://scikit-learn.org/stable/modules/tree.html#tips-on-practical-use
# If the samples are weighted, it will be easier to optimize the tree
# structure using a weight-based pre-pruning criterion such as
# min_weight_fraction_leaf, which ensures that leaf nodes contain at
# least a fraction of the overall sum of the sample weights.
cart_kwargs = {
"min_weight_fraction_leaf": min(
0.5, self.min_bin_size / np.sum(sample_weight)
),
"max_leaf_nodes": self.n_bins
}

if self.problem_type == "classification":
cart_kwargs["class_weight"] = self.class_weight
Expand Down