快速信息增益计算

def information_gain(x, y): def _entropy(values): counts = np.bincount(values) probs = counts[np.nonzero(counts)] / float(len(values)) return - np.sum(probs * np.log(probs)) def _information_gain(feature, y): feature_set_indices = np.nonzero(feature)[1] feature_not_set_indices = [i for i in feature_range if i not in feature_set_indices] entropy_x_set = _entropy(y[feature_set_indices]) entropy_x_not_set = _entropy(y[feature_not_set_indices]) return entropy_before - (((len(feature_set_indices) / float(feature_size)) * entropy_x_set) + ((len(feature_not_set_indices) / float(feature_size)) * entropy_x_not_set)) feature_size = x.shape[0] feature_range = range(0, feature_size) entropy_before = _entropy(y) information_gain_scores = [] for feature in x.T: information_gain_scores.append(_information_gain(feature, y)) return information_gain_scores, []

cProfile.runctx( """for feature in x.T: feature_set_indices = np.nonzero(feature)[1] feature_not_set_indices = [i for i in feature_range if i not in feature_set_indices] values = y[feature_set_indices] counts = np.bincount(values) probs = counts[np.nonzero(counts)] / float(len(values)) entropy_x_set = - np.sum(probs * np.log(probs)) values = y[feature_not_set_indices] counts = np.bincount(values) probs = counts[np.nonzero(counts)] / float(len(values)) entropy_x_not_set = - np.sum(probs * np.log(probs)) result = entropy_before - (((len(feature_set_indices) / float(feature_size)) * entropy_x_set) + ((len(feature_not_set_indices) / float(feature_size)) * entropy_x_not_set)) information_gain_scores.append(result)""", globals(), locals())

ncalls tottime percall cumtime percall filename:lineno(function) 1 60.27 60.27 65.48 65.48 <string>:1(<module>) 16171 1.362 0 2.801 0 csr.py:313(_get_row_slice) 16171 0.523 0 0.892 0 coo.py:201(_check) 16173 0.394 0 0.89 0 compressed.py:101(check_format) 210235 0.297 0 0.297 0 {numpy.core.multiarray.array} 16173 0.287 0 0.331 0 compressed.py:631(prune) 16171 0.197 0 1.529 0 compressed.py:534(tocoo) 16173 0.165 0 1.263 0 compressed.py:20(__init__) 16171 0.139 0 1.669 0 base.py:415(nonzero) 16171 0.124 0 1.201 0 coo.py:111(__init__) 32342 0.123 0 0.123 0 {method 'max' of 'numpy.ndarray' objects} 48513 0.117 0 0.218 0 sputils.py:93(isintlike) 32342 0.114 0 0.114 0 {method 'sum' of 'numpy.ndarray' objects} 16171 0.106 0 3.081 0 csr.py:186(__getitem__) 32342 0.105 0 0.105 0 {numpy.lib._compiled_base.bincount} 32344 0.09 0 0.094 0 base.py:59(set_shape) 210227 0.088 0 0.088 0 {isinstance} 48513 0.081 0 1.777 0 fromnumeric.py:1129(nonzero) 32342 0.078 0 0.078 0 {method 'min' of 'numpy.ndarray' objects} 97032 0.066 0 0.153 0 numeric.py:167(asarray)

3条回答

网友

1楼 · 编辑于 2024-05-16 02:56:54

这是一个使用矩阵运算的版本。一个特性的IG是其类特定分数的平均值。

import numpy as np
from scipy.sparse import issparse
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils import check_array
from sklearn.utils.extmath import safe_sparse_dot


def ig(X, y):

    def get_t1(fc, c, f):
        t = np.log2(fc/(c * f))
        t[~np.isfinite(t)] = 0
        return np.multiply(fc, t)

    def get_t2(fc, c, f):
        t = np.log2((1-f-c+fc)/((1-c)*(1-f)))
        t[~np.isfinite(t)] = 0
        return np.multiply((1-f-c+fc), t)

    def get_t3(c, f, class_count, observed, total):
        nfc = (class_count - observed)/total
        t = np.log2(nfc/(c*(1-f)))
        t[~np.isfinite(t)] = 0
        return np.multiply(nfc, t)

    def get_t4(c, f, feature_count, observed, total):
        fnc = (feature_count - observed)/total
        t = np.log2(fnc/((1-c)*f))
        t[~np.isfinite(t)] = 0
        return np.multiply(fnc, t)

    X = check_array(X, accept_sparse='csr')
    if np.any((X.data if issparse(X) else X) < 0):
        raise ValueError("Input X must be non-negative.")

    Y = LabelBinarizer().fit_transform(y)
    if Y.shape[1] == 1:
        Y = np.append(1 - Y, Y, axis=1)

    # counts

    observed = safe_sparse_dot(Y.T, X)          # n_classes * n_features
    total = observed.sum(axis=0).reshape(1, -1).sum()
    feature_count = X.sum(axis=0).reshape(1, -1)
    class_count = (X.sum(axis=1).reshape(1, -1) * Y).T

    # probs

    f = feature_count / feature_count.sum()
    c = class_count / float(class_count.sum())
    fc = observed / total

    # the feature score is averaged over classes
    scores = (get_t1(fc, c, f) +
            get_t2(fc, c, f) +
            get_t3(c, f, class_count, observed, total) +
            get_t4(c, f, feature_count, observed, total)).mean(axis=0)

    scores = np.asarray(scores).reshape(-1)

    return scores, []

在具有1000个实例和1000个独特功能的数据集上，此实现比不使用矩阵运算的实现快100>；个。

网友

2楼 · 编辑于 2024-05-16 02:56:54

正是这段代码feature_not_set_indices = [i for i in feature_range if i not in feature_set_indices]占用了90%的时间，尝试更改为set操作

网友

3楼 · 编辑于 2024-05-16 02:56:54

不知道一年过去了，这是否还有帮助。但现在我碰巧面临着同样的文本分类任务。我使用为稀疏矩阵提供的nonzero()函数重写了您的代码。然后我扫描nz，计算相应的y_值并计算熵。

以下代码只需几秒钟即可运行news20数据集（使用libsvm稀疏矩阵格式加载）。

def information_gain(X, y):

    def _calIg():
        entropy_x_set = 0
        entropy_x_not_set = 0
        for c in classCnt:
            probs = classCnt[c] / float(featureTot)
            entropy_x_set = entropy_x_set - probs * np.log(probs)
            probs = (classTotCnt[c] - classCnt[c]) / float(tot - featureTot)
            entropy_x_not_set = entropy_x_not_set - probs * np.log(probs)
        for c in classTotCnt:
            if c not in classCnt:
                probs = classTotCnt[c] / float(tot - featureTot)
                entropy_x_not_set = entropy_x_not_set - probs * np.log(probs)
        return entropy_before - ((featureTot / float(tot)) * entropy_x_set
                             +  ((tot - featureTot) / float(tot)) * entropy_x_not_set)

    tot = X.shape[0]
    classTotCnt = {}
    entropy_before = 0
    for i in y:
        if i not in classTotCnt:
            classTotCnt[i] = 1
        else:
            classTotCnt[i] = classTotCnt[i] + 1
    for c in classTotCnt:
        probs = classTotCnt[c] / float(tot)
        entropy_before = entropy_before - probs * np.log(probs)

    nz = X.T.nonzero()
    pre = 0
    classCnt = {}
    featureTot = 0
    information_gain = []
    for i in range(0, len(nz[0])):
        if (i != 0 and nz[0][i] != pre):
            for notappear in range(pre+1, nz[0][i]):
                information_gain.append(0)
            ig = _calIg()
            information_gain.append(ig)
            pre = nz[0][i]
            classCnt = {}
            featureTot = 0
        featureTot = featureTot + 1
        yclass = y[nz[1][i]]
        if yclass not in classCnt:
            classCnt[yclass] = 1
        else:
            classCnt[yclass] = classCnt[yclass] + 1
    ig = _calIg()
    information_gain.append(ig)

    return np.asarray(information_gain)

相关问题更多 >

编程相关推荐

热门问题

热门文章