369 lines
16 KiB
Python
369 lines
16 KiB
Python
import sklearn # 用于解决打包错误
|
||
import sklearn.utils # 用于解决打包错误
|
||
import sklearn.utils._cython_blas # 用于解决打包错误
|
||
import sklearn.utils._weight_vector # 用于解决打包错误
|
||
import sklearn.neighbors # 用于解决打包错误
|
||
import sklearn.neighbors._typedefs # 用于解决打包错误
|
||
import sklearn.neighbors._partition_nodes # 用于解决打包错误
|
||
import sklearn.neighbors._quad_tree # 用于解决打包错误
|
||
import sklearn.tree._utils # 用于解决打包错误
|
||
from sklearn.cross_decomposition import PLSRegression
|
||
from sklearn.ensemble import ExtraTreesClassifier
|
||
from sklearn.ensemble import RandomForestClassifier
|
||
from sklearn.ensemble import ExtraTreesClassifier
|
||
from sklearn.svm import SVC
|
||
import numpy as np
|
||
from scipy.stats import pearsonr
|
||
from tool.algorithm.image.ImageHandle import ImageHandler
|
||
from tool.algorithm.block.blockprocess import BlockProcess
|
||
import logging
|
||
import os
|
||
import glob
|
||
from PIL import Image
|
||
from tool.file.fileHandle import fileHandle
|
||
import multiprocessing
|
||
logger = logging.getLogger("mylog")
|
||
file = fileHandle()
|
||
|
||
class MachineLeaning:
|
||
"""
|
||
机器学习库
|
||
"""
|
||
def __init__(self):
|
||
pass
|
||
|
||
@staticmethod
|
||
def gene_optimal_train_set(train_data_dic, feature_tif_dir, important_threshold=0.3, correlation_threshold=0.7): # todo 修改特征重要性
|
||
ml = MachineLeaning()
|
||
name_list = ml.get_name_list(feature_tif_dir)
|
||
X_train, Y_train = ml.gene_train_set(train_data_dic, feature_tif_dir)
|
||
optimal_feature = ml.sel_optimal_feature_set(X_train, Y_train, threshold=important_threshold)
|
||
optimal_feature = ml.remove_correlation_feature(X_train, optimal_feature, threshold=correlation_threshold)
|
||
X_train = X_train[:, optimal_feature]
|
||
logger.info('train_feature:%s', np.array(name_list)[optimal_feature])
|
||
return X_train, Y_train, optimal_feature
|
||
|
||
@ staticmethod
|
||
def sel_optimal_feature(X_train, Y_train, name_list,important_threshold=0.3, correlation_threshold=0.7):
|
||
ml = MachineLeaning()
|
||
optimal_feature = ml.sel_optimal_feature_set(X_train, Y_train, threshold=important_threshold)
|
||
optimal_feature = ml.remove_correlation_feature(X_train, optimal_feature, threshold=correlation_threshold)
|
||
X_train = X_train[:, optimal_feature]
|
||
logger.info('train_feature:%s', np.array(name_list)[optimal_feature])
|
||
return X_train, Y_train, optimal_feature
|
||
|
||
@staticmethod
|
||
def gene_test_set(feature_tif_dir, optimal_feature):
|
||
"""
|
||
生成测试集
|
||
:param feature_tif_dir : 特征影像路径字典
|
||
:param optimal_feature : 最优特征子集
|
||
:return X_test_list : 分块测试集影像路径
|
||
"""
|
||
in_tif_paths = list(glob.glob(os.path.join(feature_tif_dir, '*.tif')))
|
||
cols = ImageHandler.get_img_width(in_tif_paths[0])
|
||
rows = ImageHandler.get_img_height(in_tif_paths[0])
|
||
workspace_block_tif_path = os.path.join(feature_tif_dir, 'block')
|
||
workspace_block_feature_path = os.path.join(feature_tif_dir, 'feature')
|
||
file.creat_dirs([workspace_block_tif_path, workspace_block_feature_path])
|
||
|
||
# 特征分块
|
||
bp = BlockProcess()
|
||
block_size = bp.get_block_size(rows, cols)
|
||
|
||
bp.cut(feature_tif_dir, workspace_block_tif_path, ['tif', 'tiff'], 'tif', block_size)
|
||
img_dir, img_name = bp.get_file_names(workspace_block_tif_path, ['tif'])
|
||
dir_dict_all = bp.get_same_img(img_dir, img_name)
|
||
|
||
# 选择最优特征子集特征影像
|
||
dir_dict = {}
|
||
for n, key in zip(range(len(dir_dict_all)), dir_dict_all):
|
||
if n in optimal_feature:
|
||
dir_dict.update({key: dir_dict_all[key]})
|
||
logger.info('test_feature:%s', dir_dict.keys())
|
||
logger.info('blocking tifs success!')
|
||
X_test_list = []
|
||
# 特征维度合并
|
||
for key in dir_dict:
|
||
key_name = key
|
||
block_num = len(dir_dict[key])
|
||
break
|
||
for n in range(block_num):
|
||
name = os.path.basename(dir_dict[key_name][n])
|
||
suffix = '_' + name.split('_')[-4] + "_" + name.split('_')[-3] + "_" + name.split('_')[-2] + "_" + name.split('_')[-1]
|
||
features_path = os.path.join(workspace_block_feature_path, "features" + suffix) # + "\\features" + suffix
|
||
X_test_list.append(features_path)
|
||
features_array = np.zeros((len(dir_dict), block_size, block_size), dtype='float32')
|
||
for m, value in zip(range(len(dir_dict)), dir_dict.values()):
|
||
features_array[m, :, :] = ImageHandler.get_band_array(value[n])
|
||
features_array[np.isnan(features_array)] = 0.0 # 异常值转为0
|
||
ImageHandler.write_img(features_path, '', [0, 0, 0, 0, 0, 0], features_array)
|
||
logger.info('create features matrix success!')
|
||
# file.del_folder(workspace_block_tif_path)
|
||
# file.del_folder(workspace_block_feature_path)
|
||
return X_test_list
|
||
|
||
@staticmethod
|
||
def predict_blok(clf, X_test, rows, cols, img_path, row_begin, col_begin, block_sum, n):
|
||
logger.info('total:%s,block:%s testing data !path:%s', block_sum, n, img_path)
|
||
|
||
Y_test = clf.predict(X_test)
|
||
img = Y_test.reshape(rows, cols)
|
||
out_image = Image.fromarray(img)
|
||
out_image.save(img_path)
|
||
# bp = BlockProcess()
|
||
# bp.assign_spatial_reference_bypoint(row_begin, col_begin, self.__proj, self.__geo, img_path)
|
||
# sr = osr.SpatialReference()
|
||
# sr.ImportFromWkt(self.__proj)
|
||
# geo_transform = (self.__geo[0] + col_begin * self.__geo[1] + row_begin * self.__geo[2],
|
||
# self.__geo[1],
|
||
# self.__geo[2],
|
||
# self.__geo[3] + col_begin * self.__geo[4] + row_begin * self.__geo[5],
|
||
# self.__geo[4],
|
||
# self.__geo[5]
|
||
# )
|
||
# dst_ds = gdal.Open(img_path, gdal.GA_Update)
|
||
# if dst_ds is None:
|
||
# return False
|
||
# dst_ds.SetProjection(sr.ExportToWkt())
|
||
# dst_ds.SetGeoTransform(geo_transform)
|
||
# del dst_ds
|
||
logger.info('total:%s,block:%s test data finished !path:%s', block_sum, n, img_path)
|
||
return True
|
||
|
||
@staticmethod
|
||
def predict(clf, X_test_list, out_tif_name, workspace_processing_path,rows, cols):
|
||
"""
|
||
预测数据
|
||
:param clf : svm模型
|
||
:return X_test_list: 分块测试集影像路径
|
||
"""
|
||
ml = MachineLeaning()
|
||
# 开启多进程处理
|
||
bp = BlockProcess()
|
||
block_size = bp.get_block_size(rows, cols)
|
||
|
||
block_features_dir = X_test_list
|
||
bp_cover_dir = os.path.join(workspace_processing_path, out_tif_name + '\\') # workspace_processing_path + out_tif_name + '\\'
|
||
file.creat_dirs([bp_cover_dir])
|
||
|
||
processes_num = min([len(block_features_dir), multiprocessing.cpu_count() - 1])
|
||
pool = multiprocessing.Pool(processes=processes_num)
|
||
|
||
for path, n in zip(block_features_dir, range(len(block_features_dir))):
|
||
name = os.path.split(path)[1]
|
||
features_array = ImageHandler.get_data(path)
|
||
|
||
X_test = np.reshape(features_array, (features_array.shape[0], features_array[0].size)).T
|
||
|
||
suffix = '_' + name.split('_')[-4] + "_" + name.split('_')[-3] + "_" + name.split('_')[-2] + "_" + name.split('_')[-1]
|
||
img_path = os.path.join(bp_cover_dir, out_tif_name + suffix) # bp_cover_dir + out_tif_name + suffix
|
||
row_begin = int(name.split('_')[-4])
|
||
col_begin = int(name.split('_')[-2])
|
||
pool.apply_async(ml.predict_blok, (clf, X_test, block_size, block_size, img_path, row_begin, col_begin, len(block_features_dir), n))
|
||
|
||
pool.close()
|
||
pool.join()
|
||
|
||
# 合并影像
|
||
data_dir = bp_cover_dir
|
||
out_path = workspace_processing_path[0:-1]
|
||
bp.combine(data_dir, cols, rows, out_path, file_type=['tif'], datetype='float32')
|
||
|
||
# 添加地理信息
|
||
cover_path = os.path.join(workspace_processing_path, out_tif_name + ".tif") # workspace_processing_path + out_tif_name + ".tif"
|
||
# bp.assign_spatial_reference_byfile(self.__ref_img_path, cover_path)
|
||
return cover_path
|
||
|
||
@staticmethod
|
||
def get_name_list(feature_tif_dir):
|
||
in_tif_paths = list(glob.glob(os.path.join(feature_tif_dir, '*.tif')))
|
||
name_list = []
|
||
dim = len(in_tif_paths)
|
||
for n, path in zip(range(dim), in_tif_paths):
|
||
name_list.append(str(n)+': '+os.path.split(path)[1])
|
||
logger.info('feature_list:%s', name_list)
|
||
return name_list
|
||
|
||
|
||
@staticmethod
|
||
def gene_train_set(train_data_dic, feature_tif_dir):
|
||
"""
|
||
生成训练集
|
||
:param train_data_dic : 从csv读取的训练数据
|
||
:param feature_tif_dir : 特征影像路径路径
|
||
:return X_train, Y_train : 训练数据
|
||
"""
|
||
in_tif_paths = list(glob.glob(os.path.join(feature_tif_dir, '*.tif')))
|
||
dim = len(in_tif_paths)
|
||
X_train = np.empty(shape=(0, dim))
|
||
Y_train = np.empty(shape=(0, 1))
|
||
|
||
ids = train_data_dic['ids']
|
||
positions = train_data_dic['positions']
|
||
for id, points in zip(ids, positions):
|
||
# for data in train_data_list:
|
||
if points == []:
|
||
raise Exception('data is empty!')
|
||
row, col = zip(*points)
|
||
l = len(points)
|
||
X = np.empty(shape=(l, dim))
|
||
|
||
for n, tif_path in zip(range(dim), in_tif_paths):
|
||
feature_array = ImageHandler.get_data(tif_path)
|
||
feature_array[np.isnan(feature_array)] = 0 # 异常值填充为0
|
||
x = feature_array[row, col].T
|
||
X[:, n] = x
|
||
|
||
Y = np.full((l, 1), id)
|
||
X_train = np.vstack((X_train, X))
|
||
Y_train = np.vstack((Y_train, Y))
|
||
Y_train = Y_train.T[0, :]
|
||
|
||
logger.info("gene_train_set success!")
|
||
return X_train, Y_train
|
||
|
||
@staticmethod
|
||
def standardization(data, num=1):
|
||
# 矩阵标准化到[0,1]
|
||
min = np.nanmin(data)
|
||
max = np.nanmax(data)
|
||
data[np.isnan(data)] = min # 异常值填充为0
|
||
_range = max - min
|
||
return (data - min) / _range * num
|
||
|
||
@staticmethod
|
||
def sel_optimal_feature_set(X_train, Y_train, threshold=0.01):
|
||
"""
|
||
筛选最优特征组合
|
||
"""
|
||
model = ExtraTreesClassifier()
|
||
max = np.max(Y_train)
|
||
if max < 0.1:
|
||
Y_train = (Y_train*10000).astype('int')
|
||
model.fit(X_train, Y_train.astype('int'))
|
||
# select the relative importance of each attribute
|
||
importances = model.feature_importances_
|
||
logger.info('importances:%s,threshold=%s', importances, threshold)
|
||
|
||
importances_resort = -np.sort(-importances) # 从大到小排序
|
||
imp_argsort = np.argsort(-importances) # 输出从大到小的序号
|
||
|
||
optimal_feature = list(imp_argsort[np.where(importances_resort > threshold)]) # 过滤重要性低的特征
|
||
logger.info('optimal_feature:%s', optimal_feature)
|
||
|
||
if len(optimal_feature)==0:
|
||
logger.error('optimal_feature is empty')
|
||
optimal_feature = list(imp_argsort)
|
||
return optimal_feature
|
||
|
||
@staticmethod
|
||
def correlation_map(x, y):
|
||
# https://blog.csdn.net/weixin_39836726/article/details/110783640
|
||
# cc matrix based on scipy pearsonr
|
||
n_row_x = x.shape[0]
|
||
n_row_y = x.shape[0]
|
||
ccmtx_xy = np.empty((n_row_x, n_row_y))
|
||
for n in range(n_row_x):
|
||
for m in range(n_row_y):
|
||
ccmtx_xy[n, m] = pearsonr(x[n, :], y[m, :])[0]
|
||
return ccmtx_xy
|
||
|
||
@staticmethod
|
||
def remove_correlation_feature(X_train,validity_list, threshold=0.85):
|
||
"""
|
||
相关性抑制,去除相关性
|
||
:param X_train : 训练集
|
||
:param validity_list : 最优特征子集
|
||
:param threshold: 相关性阈值
|
||
:return validity_list : 最优特征子集
|
||
"""
|
||
ccmtx = MachineLeaning().correlation_map(X_train[:, validity_list].T, X_train[:, validity_list].T)
|
||
ccmtx = np.abs(ccmtx)
|
||
for r in range(len(validity_list)):
|
||
for c in range(len(validity_list)):
|
||
if c <= r:
|
||
ccmtx[r, c] = 0
|
||
logger.info('correlation_map:\n %s', ccmtx)
|
||
# 相关性大于0.85的特征,删除com_sep_coef较大的特征
|
||
high_corr = np.unique(np.where(ccmtx > threshold)[1]) # 删除的特征序号
|
||
validity_list = np.delete(validity_list, high_corr)
|
||
logger.info('validity_list_corr:%s', validity_list)
|
||
logger.info(validity_list)
|
||
return validity_list
|
||
|
||
@staticmethod
|
||
def gene_train_data(block_features_dir,rows,cols,block_size,measured_data_img):
|
||
# 生成训练集
|
||
X_train = []
|
||
Y_train = []
|
||
|
||
block_rows = int(np.ceil(rows/block_size))
|
||
block_cols = int(np.ceil(cols/block_size))
|
||
|
||
for data, n in zip(measured_data_img, range(len(measured_data_img))):
|
||
row = data[0]
|
||
col = data[1]
|
||
block_row = row//block_size
|
||
block_col = col//block_size
|
||
|
||
if block_row == block_rows-1:
|
||
part_img_row = row - (rows - block_size)
|
||
else:
|
||
part_img_row = row % block_size
|
||
|
||
if block_col == block_cols-1:
|
||
part_img_col = col - (cols-block_size)
|
||
else:
|
||
part_img_col = col % block_size
|
||
|
||
features_path = block_features_dir[block_row*block_rows + block_col]
|
||
features_array = ImageHandler().get_data(features_path)
|
||
|
||
feature = features_array[:, part_img_row, part_img_col]
|
||
if not np.isnan(feature).any() or np.isinf(feature).any():
|
||
X_train.append(list(feature))
|
||
Y_train.append([data[2]])
|
||
logger.info('total:%s,num:%s create train set success!', len(measured_data_img), n)
|
||
return np.array(X_train), np.array(Y_train)
|
||
|
||
@staticmethod
|
||
def trainRF(X_train, Y_train):
|
||
#随机森林
|
||
logger.info('RF trainning')
|
||
clf = RandomForestClassifier()
|
||
clf.fit(X_train, Y_train)
|
||
return clf
|
||
|
||
@staticmethod
|
||
def trainSVM(X_train, Y_train, cost=1, kernel='rbf'):
|
||
logger.info('svm trainning')
|
||
clf = SVC(decision_function_shape='ovo')
|
||
clf.fit(X_train, Y_train)
|
||
SVC(C=cost, cache_size=1000, class_weight='balanced', coef0=0.0, decision_function_shape='ovr',
|
||
degree=3, gamma='auto', kernel=kernel, max_iter=-1, probability=False, random_state=None,
|
||
shrinking=True, tol=0.001, verbose=True)
|
||
return clf
|
||
@staticmethod
|
||
def vegetationPhenology_combine_feature(feature_dir,workspace_processing_path, name, rows, cols, debug =False):
|
||
ml = MachineLeaning()
|
||
path_list = list(glob.glob(os.path.join(feature_dir, '*.tif')))
|
||
#多维矩阵合并为一个
|
||
name_featuresPath_dic = {}
|
||
dim = len(path_list)
|
||
features_path = workspace_processing_path + name + "/"+ name +'_features.tif'
|
||
if debug== False:
|
||
features_array = np.zeros((dim, rows, cols), dtype='float16')
|
||
for m, path in zip(range(dim), path_list):
|
||
data = ImageHandler.get_data(path)
|
||
data = ml.standardization(data)
|
||
features_array[m, :, :] = data
|
||
# 异常值转为0
|
||
features_array[np.isnan(features_array)] = 0.0
|
||
features_array[np.isinf(features_array)] = 0.0
|
||
ImageHandler.write_img(features_path, '', [0, 0, 0, 0, 0, 0], features_array)
|
||
name_featuresPath_dic.update({name: features_path})
|
||
return name_featuresPath_dic
|
||
|
||
|