SIMOrthoProgram-Orth_LT1AB-.../Ortho/tool/algorithm/ml/MonteCarloSampling.py

# -*- coding: UTF-8 -*-
"""
@Project:SalinityMain.py
@File:MonteCarloSampling.py
@Function:基于蒙特卡洛随机抽样的最优特征选择算法
@Contact:
@Author:SHJ
@Date:2021/10/19 11:30 
@Version:1.0.0
"""
import numpy as np
from numpy import random
import matplotlib.pyplot as plt
import seaborn as sns
import logging
logger = logging.getLogger("mylog")


def api_sel_feature(x_list, iter=100, alpha=0.5, ts=-0.5, iter_ratio=0.2):
    """
    Monte Carlo random-resampling test of a single feature against its classes.

    The compactness-separation coefficient of the original grouping is
    compared with the coefficient obtained after randomly shuffling about
    one third of the samples across the pooled data, repeated `iter` times.

    :para x_list: training samples of one feature for k classes,
                  [X1, X2, ..., Xk]; Xi = np.array([x1, x2, ..., xn]) holds
                  the n samples of class i
    :para iter: number of Monte Carlo iterations
    :para alpha: weighting factor forwarded to cal_com_sep_coef
    :para ts: threshold on the coefficient of the original data; when the
              coefficient is below it the function returns False at once
    :para iter_ratio: fraction of iterations; when more than
                      iter * iter_ratio shuffles yield a coefficient that is
                      <= the original one, the function returns False
    :return: tuple (selected, com_sep_coef_old) where `selected` is the
             True/False decision and `com_sep_coef_old` is the coefficient
             of the unshuffled data
    """
    com_sep_coef_old = cal_com_sep_coef(x_list, alpha)
    if com_sep_coef_old < ts:
        return False, com_sep_coef_old

    # Pool all classes into one row vector and remember where each class
    # ends so the shuffled vector can be split back into classes.
    class_sizes = [len(x) for x in x_list]
    num_sampler = sum(class_sizes)  # total number of samples
    X = np.hstack([np.asarray(x) for x in x_list])
    split_points = np.cumsum(class_sizes)[:-1].tolist()
    num = int(np.ceil(num_sampler / 3))  # samples shuffled per iteration

    flag = 0  # shuffles whose coefficient is <= the original one
    for _ in range(iter):
        # Draw `num` random positions (with replacement) in the pooled vector.
        randmtx = np.random.rand(1, num)
        sel_idx = np.ceil(randmtx * num_sampler).astype(int)
        # rand() may return exactly 0.0, which would give index -1 and wrap
        # to the last sample; clamp to the first valid index instead.
        sel_idx = np.maximum(np.sort(sel_idx[0, :]) - 1, 0)

        # Permute the values at the drawn positions, leave the rest as is.
        X_new_sel = X.copy()
        X_new_sel[sel_idx] = np.random.permutation(X[sel_idx])

        X_new_list = np.split(X_new_sel, split_points)
        com_sep_coef_new = cal_com_sep_coef(X_new_list, alpha)
        if com_sep_coef_new <= com_sep_coef_old:
            flag += 1

    # Guard the ratio so that iter == 0 does not raise ZeroDivisionError.
    ratio = flag / iter if iter else 0.0
    logger.info('flag:%s, iter:%s, falg/iter:%s', flag, iter, ratio)
    if flag > (iter * iter_ratio):
        return False, com_sep_coef_old
    return True, com_sep_coef_old

def cal_com_coef(x_list):
    """
    Compute the intra-class compactness coefficient of one feature.

    For every class the absolute differences between all ordered pairs of
    samples are summed and divided by n * (n - 1); the per-class values are
    then averaged over the classes.

    :para x_list: training samples of one feature for k classes,
                  [X1, X2, ..., Xk]; Xi = np.array([x1, x2, ..., xn]) holds
                  the n samples of class i
    :return com_coef: compactness coefficient; 0.0 when x_list is empty
    """
    class_num = len(x_list)
    if class_num == 0:
        # No classes at all: nothing to average.
        return 0.0

    coef_array = np.zeros(class_num)
    for m, samples in enumerate(x_list):
        samples = np.asarray(samples)
        sample_num = len(samples)
        if sample_num < 2:
            # A class with fewer than two samples has no sample pairs; count
            # it as 0 instead of dividing by zero and producing NaN.
            continue
        # One row sum per sample keeps memory linear in the class size.
        row_sums = np.array(
            [np.sum(np.abs(value - samples)) for value in samples],
            dtype=float,
        )
        coef_array[m] = np.sum(row_sums) / (sample_num * (sample_num - 1))
    return np.sum(coef_array) / class_num

def cal_sep_coef(x_list):
    """
    Compute the inter-class separation coefficient of one feature.

    For every ordered pair of distinct classes (m, n) the mean absolute
    difference between all samples of m and all samples of n is computed;
    the values are summed and divided by k * (k - 1).

    :para x_list: training samples of one feature for k classes,
                  [X1, X2, ..., Xk]; Xi = np.array([x1, x2, ..., xn]) holds
                  the n samples of class i
    :return sep_coef: separation coefficient; 0 when there are fewer than
                      two classes
    """
    class_num = len(x_list)
    classes = [np.asarray(x) for x in x_list]

    coef_sum = 0
    for m, xm in enumerate(classes):
        l_xm = len(xm)
        # Turn class m into a column once per class so that `xm_col - xn`
        # broadcasts to the full pairwise difference matrix. Doing this inside
        # the inner loop would add one axis per other class and eventually
        # exceed NumPy's maximum number of dimensions.
        xm_col = np.expand_dims(xm, 1)
        for n, xn in enumerate(classes):
            if n == m:
                continue
            coef_sum = coef_sum + np.sum(np.abs(xm_col - xn)) / (l_xm * len(xn))

    if class_num < 2:
        # No class pairs exist, so there is nothing to normalise.
        return coef_sum
    return coef_sum / (class_num * (class_num - 1))

def cal_com_sep_coef(x_list, alpha=0.5):
    """
    Combine intra-class compactness and inter-class separation into one
    coefficient: alpha * com_coef - (1 - alpha) * sep_coef.

    :para x_list: training samples of one feature for k classes,
                  [X1, X2, ..., Xk]; Xi = np.array([x1, x2, ..., xn]) holds
                  the n samples of class i
    :para alpha: weighting factor, must lie within [0, 1]
    :return com_sep_coef: compactness-separation coefficient
    :raises ValueError: when alpha is outside [0, 1]
    """
    # A chained comparison is required here: `not alpha >= 0 and alpha <= 1`
    # parses as `(not alpha >= 0) and (alpha <= 1)` and lets alpha > 1 through.
    if not 0 <= alpha <= 1:
        raise ValueError('input_para_alpha beyond (0,1)!')
    com_coef = cal_com_coef(x_list)
    sep_coef = cal_sep_coef(x_list)
    com_sep_coef = alpha * com_coef - (1 - alpha) * sep_coef
    return com_sep_coef

def get_logistic_rand_number(num, u=0.4):  # deprecated
    """
    Generate a (1, num) array from the recurrence x[i] = u * x[i-1] * (1 - x[i-1]).

    The sequence starts at the fixed value 0.5, is scaled by 3 * num and
    rounded up to whole numbers.

    :para num: number of values to generate
    :para u: multiplier used in the recurrence
    :return: float array of shape (1, num) holding the rounded-up values
    """
    sequence = np.zeros((1, num), dtype=float)
    sequence[0, 0] = 0.5  # fixed starting value

    for col in range(1, num):
        prev = sequence[0, col - 1]
        sequence[0, col] = u * prev * (1 - prev)

    scaled = sequence * 3 * num
    return np.ceil(scaled)

def test():
    '''Manual visual check of the random-number generators.

    Shuffles a small sample array, then plots normal and logistic
    distributions and a scatter of sorted logistic samples. Every
    plt.show() call displays a figure; nothing is returned.
    '''
    # Random permutation: numpy's shuffle needs the array to shuffle in place
    # (calling it without an argument raises TypeError).
    a = np.array([3.4, 2.5, 1.8, 4.7, 5.6, 2.1])
    random.shuffle(a)

    # Compare normal and logistic random samples.
    # NOTE(review): sns.distplot is reported as deprecated in newer seaborn
    # releases - confirm against the installed version before relying on it.
    sns.distplot(random.normal(scale=2, size=1000), hist=False, label='normal')
    sns.distplot(random.logistic(loc=2, scale=0.5, size=1000), hist=False, label='logistic')
    plt.show()

    # Scatter plot of sorted logistic samples against their rank.
    randmtx = random.logistic(loc=0.5, scale=0.5, size=100)
    randmtx.sort(axis=0)
    randmty = np.arange(0, 100, 1)
    randmty = np.expand_dims(randmty, 1)
    fig, axes = plt.subplots(1, 1, figsize=(5, 5))
    axes.scatter(randmty, randmtx, alpha=.3, label='ground truth')
    axes.legend()
    plt.tight_layout()
    plt.show()

# if __name__ == '__main__':
    # 例子
    # x1 = np.array([1, 1.1])
    # x2 = np.array([2, 2.1, 2.2])
    # x3 = np.array([3, 3.4, 3.1])
    # x_list = [x1, x2, x3]
    # com_sep_coef = cal_com_sep_coef(x_list, 0.5)
    # flag = api_sel_feature(x_list)
    # print('done')