SIMOrthoProgram-Orth_LT1AB-.../Ortho/tool/algorithm/ml/MonteCarloSampling.py

186 lines
6.1 KiB
Python
Raw Normal View History

# -*- coding: UTF-8 -*-
"""
@Project:SalinityMain.py
@File:MonteCarloSampling.py
@Function:基于蒙特卡洛随机抽样的最优特征选择算法
@Contact:
@Author:SHJ
@Date:2021/10/19 11:30
@Version:1.0.0
"""
import numpy as np
from numpy import random
import matplotlib.pyplot as plt
import seaborn as sns
import logging
# Module-level logger, looked up by the fixed name "mylog".
logger = logging.getLogger("mylog")
def api_sel_feature(x_list, iter=100, alpha=0.5, ts=-0.5, iter_ratio=0.2):
    """
    Monte Carlo test of one feature: compare the compactness-separation
    coefficient of the labelled samples with the coefficient obtained after
    randomly shuffling part of the pooled samples across class boundaries.

    :para x_list: training samples of a single feature for k classes,
        [X1, X2, ..., Xk], where Xi = np.array([x1, x2, ..., xn]) holds the
        n samples of class i
    :para iter: number of random-shuffle iterations
    :para alpha: weighting factor forwarded to cal_com_sep_coef
    :para ts: threshold on the coefficient; if the coefficient of the original
        samples is below ts the feature is rejected without sampling
    :para iter_ratio: the feature is rejected when more than iter * iter_ratio
        shuffled trials yield a coefficient <= the original one
    :return: tuple (selected, com_sep_coef_old); selected is True when the
        feature is kept and False when it is rejected, com_sep_coef_old is
        the coefficient of the unshuffled samples
    """
    com_sep_coef_old = cal_com_sep_coef(x_list, alpha)
    if com_sep_coef_old < ts:
        return False, com_sep_coef_old

    # Pool all classes into one vector and remember where each class ends so
    # the shuffled vector can be cut back into the same class sizes.
    class_sizes = [len(x) for x in x_list]
    num_sampler = sum(class_sizes)  # total number of samples
    X = np.hstack([np.asarray(x) for x in x_list])
    split_points = np.cumsum(class_sizes)[:-1].tolist()

    # About one third of the pooled samples are drawn (with replacement) per trial.
    num = int(np.ceil(num_sampler / 3))
    flag = 0
    for _ in range(iter):
        # Draw random sample positions in [0, num_sampler - 1].
        randmtx = np.random.rand(num)
        picked = np.ceil(randmtx * num_sampler).astype(int) - 1
        # rand() samples [0, 1): a draw of exactly 0.0 would give index -1 and
        # silently address the last sample, so clamp into the valid range.
        picked = np.sort(np.clip(picked, 0, num_sampler - 1))

        # Permute the drawn values among the drawn positions, then split the
        # vector back into per-class arrays.
        X_new_sel = X.copy()
        X_new_sel[picked] = np.random.permutation(X[picked])
        X_new_list = np.split(X_new_sel, split_points)

        com_sep_coef_new = cal_com_sep_coef(X_new_list, alpha)
        if com_sep_coef_new <= com_sep_coef_old:
            flag += 1

    # Guard the ratio so iter == 0 does not raise ZeroDivisionError.
    ratio = int(flag) / int(iter) if iter else 0.0
    logger.info('flag:' + str(flag) +', iter:' + str(iter) + ', falg/iter:' + str(ratio))
    if flag > (iter * iter_ratio):
        return False, com_sep_coef_old
    return True, com_sep_coef_old
def cal_com_coef(x_list):
    """
    Compute the intra-class compactness coefficient: for every class, the mean
    absolute difference over all ordered pairs of distinct samples, averaged
    over the classes.

    :para x_list: training samples of a single feature for k classes,
        [X1, X2, ..., Xk], where Xi = np.array([x1, x2, ..., xn]) holds the
        n samples of class i
    :return com_coef: compactness coefficient; a class with fewer than two
        samples has no sample pairs and contributes 0, and an empty x_list
        yields 0.0
    """
    class_num = len(x_list)
    if class_num == 0:
        # No classes at all: avoid dividing by zero below.
        return 0.0

    coef_array = np.zeros(class_num)
    for m, samples in enumerate(x_list):
        samples = np.asarray(samples)
        sample_num = len(samples)
        if sample_num < 2:
            # sample_num * (sample_num - 1) would be 0; leave the entry at 0.
            continue
        # Row u holds sum_v |x_u - x_v|; the scalar broadcasts against the class.
        row_sums = np.array([np.sum(np.abs(value - samples)) for value in samples])
        coef_array[m] = np.sum(row_sums) / (sample_num * (sample_num - 1))

    com_coef = np.sum(coef_array) / class_num
    return com_coef
def cal_sep_coef(x_list):
    """
    Compute the inter-class separation coefficient: the mean absolute
    difference between samples of two different classes, averaged over all
    ordered pairs of distinct classes.

    :para x_list: training samples of a single feature for k classes,
        [X1, X2, ..., Xk], where Xi = np.array([x1, x2, ..., xn]) holds the
        n samples of class i
    :return sep_coef: separation coefficient; 0 when fewer than two classes
        are given
    """
    class_num = len(x_list)
    if class_num < 2:
        # No pair of distinct classes exists.
        return 0

    coef_sum = 0
    for m, xm in enumerate(x_list):
        l_xm = len(xm)
        # Turn class m into a column once, so that (column - row) broadcasts to
        # the full l_xm x l_xn difference matrix. Doing this inside the inner
        # loop would add one axis per class and eventually exceed NumPy's
        # maximum number of dimensions.
        xm_col = np.expand_dims(np.asarray(xm), 1)
        for n, xn in enumerate(x_list):
            if n == m:
                continue
            coef_sum = coef_sum + np.sum(np.abs(xm_col - xn)) / (l_xm * len(xn))

    sep_coef = coef_sum / (class_num * (class_num - 1))
    return sep_coef
def cal_com_sep_coef(x_list, alpha=0.5):
    """
    Combine intra-class compactness and inter-class separation into a single
    coefficient: alpha * com_coef - (1 - alpha) * sep_coef.

    :para x_list: training samples of a single feature for k classes,
        [X1, X2, ..., Xk], where Xi = np.array([x1, x2, ..., xn]) holds the
        n samples of class i
    :para alpha: weighting factor, must lie in [0, 1]
    :return com_sep_coef: compactness-separation coefficient
    :raises ValueError: if alpha is outside [0, 1]
    """
    # Chained comparison rejects both alpha < 0 and alpha > 1 (and NaN).
    if not 0 <= alpha <= 1:
        raise ValueError('input_para_alpha beyond (0,1)!')
    com_coef = cal_com_coef(x_list)
    sep_coef = cal_sep_coef(x_list)
    com_sep_coef = alpha * com_coef - (1 - alpha) * sep_coef
    return com_sep_coef
def get_logistic_rand_number(num, u=0.4):  # deprecated
    """
    Generate `num` values from the logistic-map recurrence
    x[i] = u * x[i-1] * (1 - x[i-1]) started at x[0] = 0.5, scale them by
    3 * num and round up.

    :para num: number of values to generate
    :para u: multiplier of the recurrence
    :return: array of shape (1, num) holding the rounded-up scaled values
    """
    sequence = np.zeros((1, num))
    sequence[0, 0] = 0.5  # fixed initial value
    for idx in range(1, num):
        previous = sequence[0, idx - 1]
        sequence[0, idx] = u * previous * (1 - previous)
    scaled = sequence * 3 * num
    return np.ceil(scaled)
def test():
    """
    Manual visual check of the random-number generators; opens plot windows
    and returns nothing.
    """
    # Random in-place permutation. shuffle() needs the array to permute;
    # calling it without an argument raises TypeError.
    samples = np.array([3.4, 2.5, 1.8, 4.7, 5.6, 2.1])
    random.shuffle(samples)

    # Compare the density of normal and logistic random numbers.
    # NOTE(review): distplot is deprecated in recent seaborn releases - confirm
    # the installed version still provides it.
    sns.distplot(random.normal(scale=2, size=1000), hist=False, label='normal')
    sns.distplot(random.logistic(loc=2, scale=0.5, size=1000), hist=False, label='logistic')
    plt.show()

    # Scatter plot of sorted logistic random numbers against their rank.
    randmtx = random.logistic(loc=0.5, scale=0.5, size=100)
    randmtx.sort(axis=0)
    randmty = np.arange(0, 100, 1)
    randmty = np.expand_dims(randmty, 1)
    fig, axes = plt.subplots(1, 1, figsize=(5, 5))
    axes.scatter(randmty, randmtx, alpha=.3, label='ground truth')
    axes.legend()
    plt.tight_layout()
    plt.show()
# if __name__ == '__main__':
# 例子
# x1 = np.array([1, 1.1])
# x2 = np.array([2, 2.1, 2.2])
# x3 = np.array([3, 3.4, 3.1])
# x_list = [x1, x2, x3]
# com_sep_coef = cal_com_sep_coef(x_list, 0.5)
# flag = api_sel_feature(x_list)
# print('done')