# -*- coding: UTF-8 -*- """ @Project:SalinityMain.py @File:MonteCarloSampling.py @Function:基于蒙特卡洛随机抽样的最优特征选择算法 @Contact: @Author:SHJ @Date:2021/10/19 11:30 @Version:1.0.0 """ import numpy as np from numpy import random import matplotlib.pyplot as plt import seaborn as sns import logging logger = logging.getLogger("mylog") def api_sel_feature(x_list, iter=100, alpha=0.5, ts=-0.5, iter_ratio=0.2): """ :para x_list: k类别的单个特征的训练样本 [X1,X2,X3,...,Xi,...,Xk], Xi = np.array([x1,x2,x3...xn]), 第i类别的训练样本数为n :para iter: 迭代次数 :para alpha: 调节因子 :para ts: com_sep_coef的阈值 :para iter_ratio : 迭代次数阈值 :return : True-特征与类别相关度高,False-特征与类别相关度低 """ com_sep_coef_old = cal_com_sep_coef(x_list, alpha) # print('com_sep_coef_old:', com_sep_coef_old) if com_sep_coef_old < ts: return False, com_sep_coef_old X = np.zeros(1) # x_list组合为行向量X x_len_list = [] # 记录每个类别x的位置 num_sampler = 0 # 样本总数 t = 0 flag = 0 for x in x_list: len_x = len(x) if t == 0: X = x x_len_list.append(len_x) else: X = np.hstack([X, x]) x_len_list.append(x_len_list[t - 1] + len_x) num_sampler += len_x t += 1 x_len_list.pop() num = int(np.ceil(num_sampler / 3)) for i in range(iter): # 生成随机数组 randmtx = np.random.rand(1, num) randmtx_ceil = np.ceil(randmtx * num_sampler).astype(int) randmtx_ceil = np.sort(randmtx_ceil[0, :]) - 1 # 随机取值,重排后,替换原来的数据,组成新数组 X_new_sel = X.copy() X_new_sel[randmtx_ceil] = np.random.permutation(X[randmtx_ceil]) X_new_list = np.split(X_new_sel, x_len_list) com_sep_coef_new = cal_com_sep_coef(X_new_list, alpha) if com_sep_coef_new <= com_sep_coef_old: flag += 1 # print('com_sep_coef_new:', com_sep_coef_new) logger.info('flag:' + str(flag) +', iter:' + str(iter) + ', falg/iter:' + str(int(flag)/int(iter))) if flag > (iter * iter_ratio): return False, com_sep_coef_old return True, com_sep_coef_old def cal_com_coef(x_list): """ :para x_list: k类别的单个特征的训练样本 [X1,X2,X3,...,Xi,...,Xk],Xi = np.array([x1,x2,x3...xn]), 第i类别的训练样本数为n :return com_coef : 类内聚合因子(Compactness Coefficient) """ class_num = len(x_list) coef_array = np.full((1, class_num), 0.0) for m in range(class_num): sample_num = len(x_list[m]) c = np.full((1, sample_num), 0.0) for u in range(sample_num): l = np.full((1, sample_num), x_list[m][u]) c[0, u] = np.sum(np.abs(l - x_list[m])) coef_array[0, m] = np.sum(c) / (sample_num * (sample_num - 1)) com_coef = np.sum(coef_array) / class_num return com_coef def cal_sep_coef(x_list): """ :para x_list : k类别的单个特征的训练样本 [X1,X2,X3,...,Xi,...,Xk],Xi = np.array([x1,x2,x3...xn]), 第i类别的训练样本数为n :return sep_coef : 类间离散度(Separation Coefficient) """ class_num = len(x_list) coef_list = [] coef_sum = 0 for m in range(class_num): xm = x_list[m] l_xm = len(xm) for n in range(class_num): if not n == m: xn = x_list[n] l_xn = len(xn) xm = np.expand_dims(xm, 1) coef_list.append(np.sum(np.abs(xm - xn)) / (l_xm * l_xn)) for coef in coef_list: coef_sum = coef_sum + coef if class_num == 1 or class_num == 0: sep_coef = coef_sum else: sep_coef = coef_sum / (class_num * (class_num - 1)) return sep_coef def cal_com_sep_coef(x_list, alpha = 0.5): """ :para x_list: k类别的单个特征的训练样本 [X1,X2,X3,...,Xi,...,Xk],Xi = np.array([x1,x2,x3...xn]), 第i类别的训练样本数为n :para alpha : 调节因子 :return com_sep_coef: 类内聚合度和类间离散度的因子(Compactness- Separation Coeffcient) """ if not alpha >= 0 and alpha <= 1: raise ('input_para_alpha beyond (0,1)!') com_coef = cal_com_coef(x_list) sep_coef = cal_sep_coef(x_list) com_sep_coef = alpha * com_coef - (1-alpha) * sep_coef return com_sep_coef def get_logistic_rand_number(num, u=0.4): #弃用 randmtx = np.full((1, num), 0.0) # randmtx[0,0] = np.random.rand(1, 1) #随机初始值 randmtx[0, 0] = 0.5 #初始值 for i in range(1, num): randmtx[0, i] = u * randmtx[0, i-1]*(1-randmtx[0, i-1]) randmtx = randmtx * 3 * num randmtx_ceil = np.ceil(randmtx) # 绘制随机数分布图 # randmty = np.arange(0,num,1) # randmty = np.expand_dims( randmty, 1) # fig, axes = plt.subplots(1, 1, figsize=(5, 5)) # axes.scatter(randmty, randmtx_ceil, alpha=.3, label='ground truth') # axes.legend() # plt.tight_layout() # plt.show() return randmtx_ceil def test(): '''测试生成随机数''' # 插入 # a = np.array([3.4, 2.5, 1.8, 4.7, 5.6, 2.1]) # b = np.array([2.5, 4.7, 5.6]) # c = a[[0,1]] # a[[0,1]] = np.array([1, 1]) # 随机排列 random.shuffle() # logist随机数 sns.distplot(random.normal(scale=2, size=1000), hist=False, label='normal') sns.distplot(random.logistic(loc=2, scale=0.5, size=1000), hist=False, label='logistic') plt.show() # 绘制随机数 randmtx = random.logistic(loc=0.5, scale=0.5, size=100) randmtx.sort(axis=0) randmty = np.arange(0,100,1) randmty = np.expand_dims(randmty, 1) fig, axes = plt.subplots(1, 1, figsize=(5, 5)) axes.scatter(randmty, randmtx, alpha=.3, label='ground truth') axes.legend() plt.tight_layout() plt.show() # if __name__ == '__main__': # 例子 # x1 = np.array([1, 1.1]) # x2 = np.array([2, 2.1, 2.2]) # x3 = np.array([3, 3.4, 3.1]) # x_list = [x1, x2, x3] # com_sep_coef = cal_com_sep_coef(x_list, 0.5) # flag = api_sel_feature(x_list) # print('done')