import sklearn  # explicit submodule imports below work around packaging (freeze) errors
import sklearn.utils
import sklearn.utils._cython_blas
import sklearn.utils._weight_vector
import sklearn.neighbors
import sklearn.neighbors._typedefs
import sklearn.neighbors._partition_nodes
import sklearn.neighbors._quad_tree
import sklearn.tree._utils
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.svm import SVC
import numpy as np
from scipy.stats import pearsonr
from tool.algorithm.image.ImageHandle import ImageHandler
from tool.algorithm.block.blockprocess import BlockProcess
import logging
import os
import glob
from PIL import Image
from tool.file.fileHandle import fileHandle
import multiprocessing

logger = logging.getLogger("mylog")
file = fileHandle()


class MachineLeaning:
    """
    Machine learning library.
    """

    def __init__(self):
        pass

    @staticmethod
    def gene_optimal_train_set(train_data_dic, feature_tif_dir, important_threshold=0.3, correlation_threshold=0.7):  # TODO: revisit the feature-importance threshold
        """
        Build a training set restricted to the optimal feature subset.

        :param train_data_dic: training samples read from csv
        :param feature_tif_dir: directory of feature images
        :param important_threshold: minimum feature importance to keep a feature
        :param correlation_threshold: correlation above which redundant features are dropped
        :return X_train, Y_train, optimal_feature
        """
        ml = MachineLeaning()
        name_list = ml.get_name_list(feature_tif_dir)
        X_train, Y_train = ml.gene_train_set(train_data_dic, feature_tif_dir)
        optimal_feature = ml.sel_optimal_feature_set(X_train, Y_train, threshold=important_threshold)
        optimal_feature = ml.remove_correlation_feature(X_train, optimal_feature, threshold=correlation_threshold)
        X_train = X_train[:, optimal_feature]
        logger.info('train_feature:%s', np.array(name_list)[optimal_feature])
        return X_train, Y_train, optimal_feature

    @staticmethod
    def sel_optimal_feature(X_train, Y_train, name_list, important_threshold=0.3, correlation_threshold=0.7):
        """
        Select the optimal feature subset from an existing training set.
        """
        ml = MachineLeaning()
        optimal_feature = ml.sel_optimal_feature_set(X_train, Y_train, threshold=important_threshold)
        optimal_feature = ml.remove_correlation_feature(X_train, optimal_feature, threshold=correlation_threshold)
        X_train = X_train[:, optimal_feature]
        logger.info('train_feature:%s', np.array(name_list)[optimal_feature])
        return X_train, Y_train, optimal_feature

    @staticmethod
    def gene_test_set(feature_tif_dir, optimal_feature):
        """
        Build the test set.

        :param feature_tif_dir: directory of feature images
        :param optimal_feature: optimal feature subset (band indices)
        :return X_test_list: paths of the blocked test-set images
        """
        in_tif_paths = list(glob.glob(os.path.join(feature_tif_dir, '*.tif')))
        cols = ImageHandler.get_img_width(in_tif_paths[0])
        rows = ImageHandler.get_img_height(in_tif_paths[0])
        workspace_block_tif_path = os.path.join(feature_tif_dir, 'block')
        workspace_block_feature_path = os.path.join(feature_tif_dir, 'feature')
        file.creat_dirs([workspace_block_tif_path, workspace_block_feature_path])

        # split the feature images into blocks
        bp = BlockProcess()
        block_size = bp.get_block_size(rows, cols)

        bp.cut(feature_tif_dir, workspace_block_tif_path, ['tif', 'tiff'], 'tif', block_size)
        img_dir, img_name = bp.get_file_names(workspace_block_tif_path, ['tif'])
        dir_dict_all = bp.get_same_img(img_dir, img_name)

        # keep only the feature images in the optimal feature subset
        dir_dict = {}
        for n, key in enumerate(dir_dict_all):
            if n in optimal_feature:
                dir_dict.update({key: dir_dict_all[key]})
        logger.info('test_feature:%s', dir_dict.keys())
        logger.info('blocking tifs success!')
        X_test_list = []

        # stack the feature bands of each block into one multi-band image
        key_name = next(iter(dir_dict))
        block_num = len(dir_dict[key_name])
        for n in range(block_num):
            name = os.path.basename(dir_dict[key_name][n])
            # block names are assumed to end with '_<r0>_<r1>_<c0>_<c1>.tif'
            suffix = '_' + name.split('_')[-4] + "_" + name.split('_')[-3] + "_" + name.split('_')[-2] + "_" + name.split('_')[-1]
            features_path = os.path.join(workspace_block_feature_path, "features" + suffix)
            X_test_list.append(features_path)
            features_array = np.zeros((len(dir_dict), block_size, block_size), dtype='float32')
            for m, value in enumerate(dir_dict.values()):
                features_array[m, :, :] = ImageHandler.get_band_array(value[n])
            features_array[np.isnan(features_array)] = 0.0  # replace NaN with 0
            ImageHandler.write_img(features_path, '', [0, 0, 0, 0, 0, 0], features_array)
        logger.info('create features matrix success!')
        # file.del_folder(workspace_block_tif_path)
        # file.del_folder(workspace_block_feature_path)
        return X_test_list
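
    # Example of the assumed block naming: a block file
    # 'feature_0_1024_2048_3072.tif' yields suffix '_0_1024_2048_3072.tif',
    # from which predict() parses row_begin=0 and col_begin=2048.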

    @staticmethod
    def predict_blok(clf, X_test, rows, cols, img_path, row_begin, col_begin, block_sum, n):
        """
        Predict one block and save the result as an image.
        """
        logger.info('total:%s,block:%s testing data !path:%s', block_sum, n, img_path)

        Y_test = clf.predict(X_test)
        img = Y_test.reshape(rows, cols)
        out_image = Image.fromarray(img)
        out_image.save(img_path)
        # NOTE: per-block georeferencing (SetProjection/SetGeoTransform via GDAL)
        # is intentionally disabled here; the spatial reference is assigned to
        # the combined image after prediction.
        logger.info('total:%s,block:%s test data finished !path:%s', block_sum, n, img_path)
        return True

    @staticmethod
    def predict(clf, X_test_list, out_tif_name, workspace_processing_path, rows, cols):
        """
        Predict block-wise and combine the blocks into one image.

        :param clf: fitted classifier (e.g. an SVM or random-forest model)
        :param X_test_list: paths of the blocked test-set images
        :return cover_path: path of the combined prediction image
        """
        ml = MachineLeaning()
        # process the blocks with a multiprocessing pool
        bp = BlockProcess()
        block_size = bp.get_block_size(rows, cols)

        block_features_dir = X_test_list
        bp_cover_dir = os.path.join(workspace_processing_path, out_tif_name + '\\')
        file.creat_dirs([bp_cover_dir])

        processes_num = min([len(block_features_dir), multiprocessing.cpu_count() - 1])
        pool = multiprocessing.Pool(processes=processes_num)

        for n, path in enumerate(block_features_dir):
            name = os.path.split(path)[1]
            features_array = ImageHandler.get_data(path)

            # (bands, rows, cols) -> (pixels, bands)
            X_test = np.reshape(features_array, (features_array.shape[0], features_array[0].size)).T

            suffix = '_' + name.split('_')[-4] + "_" + name.split('_')[-3] + "_" + name.split('_')[-2] + "_" + name.split('_')[-1]
            img_path = os.path.join(bp_cover_dir, out_tif_name + suffix)
            row_begin = int(name.split('_')[-4])
            col_begin = int(name.split('_')[-2])
            pool.apply_async(ml.predict_blok, (clf, X_test, block_size, block_size, img_path, row_begin, col_begin, len(block_features_dir), n))

        pool.close()
        pool.join()

        # combine the predicted blocks
        data_dir = bp_cover_dir
        out_path = workspace_processing_path[0:-1]  # strip the trailing path separator
        bp.combine(data_dir, cols, rows, out_path, file_type=['tif'], datetype='float32')

        # assign geolocation information
        cover_path = os.path.join(workspace_processing_path, out_tif_name + ".tif")
        # bp.assign_spatial_reference_byfile(self.__ref_img_path, cover_path)
        return cover_path
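
    # Typical call (hypothetical names): with a fitted classifier `clf` and
    # X_test_list from gene_test_set(), the combined product is written via
    #   cover_path = MachineLeaning.predict(clf, X_test_list, 'cover',
    #                                       'D:/proc/', rows, cols)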

    @staticmethod
    def get_name_list(feature_tif_dir):
        """
        List the feature images as 'index: filename' strings.
        """
        in_tif_paths = list(glob.glob(os.path.join(feature_tif_dir, '*.tif')))
        name_list = []
        for n, path in enumerate(in_tif_paths):
            name_list.append(str(n) + ': ' + os.path.split(path)[1])
        logger.info('feature_list:%s', name_list)
        return name_list

    @staticmethod
    def gene_train_set(train_data_dic, feature_tif_dir):
        """
        Build the training set.

        :param train_data_dic: training samples read from csv
        :param feature_tif_dir: directory of feature images
        :return X_train, Y_train: training data
        """
        in_tif_paths = list(glob.glob(os.path.join(feature_tif_dir, '*.tif')))
        dim = len(in_tif_paths)
        X_train = np.empty(shape=(0, dim))
        Y_train = np.empty(shape=(0, 1))

        ids = train_data_dic['ids']
        positions = train_data_dic['positions']
        for label, points in zip(ids, positions):
            if not points:
                raise Exception('data is empty!')
            row, col = zip(*points)
            num_points = len(points)
            X = np.empty(shape=(num_points, dim))

            for n, tif_path in enumerate(in_tif_paths):
                feature_array = ImageHandler.get_data(tif_path)
                feature_array[np.isnan(feature_array)] = 0  # replace NaN with 0
                x = feature_array[row, col].T
                X[:, n] = x

            Y = np.full((num_points, 1), label)
            X_train = np.vstack((X_train, X))
            Y_train = np.vstack((Y_train, Y))
        Y_train = Y_train.T[0, :]

        logger.info("gene_train_set success!")
        return X_train, Y_train
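
    # Expected layout of train_data_dic (an assumption consistent with the
    # parsing in gene_train_set above):
    #   {'ids': [class_id, ...],
    #    'positions': [[(row, col), (row, col), ...], ...]}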

    @staticmethod
    def standardization(data, num=1):
        """
        Scale a matrix linearly to [0, num].
        """
        d_min = np.nanmin(data)
        d_max = np.nanmax(data)
        data[np.isnan(data)] = d_min  # fill NaN with the minimum value
        _range = d_max - d_min
        return (data - d_min) / _range * num
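
    # Worked example: standardization(np.array([2.0, 4.0, 6.0])) scales by
    # (x - 2) / 4 and returns [0.0, 0.5, 1.0].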

    @staticmethod
    def sel_optimal_feature_set(X_train, Y_train, threshold=0.01):
        """
        Select the optimal feature subset by ExtraTrees feature importance.
        """
        model = ExtraTreesClassifier()
        y_max = np.max(Y_train)
        if y_max < 0.1:
            # rescale near-zero continuous labels before casting to int
            Y_train = (Y_train * 10000).astype('int')
        model.fit(X_train, Y_train.astype('int'))
        # relative importance of each feature
        importances = model.feature_importances_
        logger.info('importances:%s,threshold=%s', importances, threshold)

        importances_resort = -np.sort(-importances)  # importances, descending
        imp_argsort = np.argsort(-importances)  # feature indices, by descending importance

        optimal_feature = list(imp_argsort[np.where(importances_resort > threshold)])  # drop low-importance features
        logger.info('optimal_feature:%s', optimal_feature)

        if len(optimal_feature) == 0:
            logger.error('optimal_feature is empty')
            optimal_feature = list(imp_argsort)
        return optimal_feature

    @staticmethod
    def correlation_map(x, y):
        """
        Pairwise Pearson correlation matrix between the rows of x and y.
        """
        # https://blog.csdn.net/weixin_39836726/article/details/110783640
        # cc matrix based on scipy pearsonr
        n_row_x = x.shape[0]
        n_row_y = y.shape[0]
        ccmtx_xy = np.empty((n_row_x, n_row_y))
        for n in range(n_row_x):
            for m in range(n_row_y):
                ccmtx_xy[n, m] = pearsonr(x[n, :], y[m, :])[0]
        return ccmtx_xy
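
    # scipy's pearsonr returns (r, p_value); only r is kept, so
    # correlation_map(x, x) is symmetric with ones on the diagonal.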

    @staticmethod
    def remove_correlation_feature(X_train, validity_list, threshold=0.85):
        """
        Suppress correlated features.

        :param X_train: training set
        :param validity_list: optimal feature subset (sorted by descending importance)
        :param threshold: correlation threshold
        :return validity_list: optimal feature subset with correlated features removed
        """
        ccmtx = MachineLeaning().correlation_map(X_train[:, validity_list].T, X_train[:, validity_list].T)
        ccmtx = np.abs(ccmtx)
        # zero the diagonal and lower triangle so each pair is inspected once
        for r in range(len(validity_list)):
            for c in range(len(validity_list)):
                if c <= r:
                    ccmtx[r, c] = 0
        logger.info('correlation_map:\n %s', ccmtx)
        # for pairs whose correlation exceeds the threshold, drop the
        # lower-importance (higher column index) feature
        high_corr = np.unique(np.where(ccmtx > threshold)[1])  # indices of features to drop
        validity_list = np.delete(validity_list, high_corr)
        logger.info('validity_list_corr:%s', validity_list)
        return validity_list
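
    # Example: with threshold=0.85, if |corr| between validity_list[0] and
    # validity_list[2] is 0.9, column index 2 is dropped and the more
    # important feature validity_list[0] is kept.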

    @staticmethod
    def gene_train_data(block_features_dir, rows, cols, block_size, measured_data_img):
        """
        Build a training set from measured points (row, col, value) and
        blocked feature images.
        """
        X_train = []
        Y_train = []

        block_rows = int(np.ceil(rows / block_size))
        block_cols = int(np.ceil(cols / block_size))

        for n, data in enumerate(measured_data_img):
            row = data[0]
            col = data[1]
            block_row = row // block_size
            block_col = col // block_size

            # blocks on the bottom/right edge are aligned to the image border
            if block_row == block_rows - 1:
                part_img_row = row - (rows - block_size)
            else:
                part_img_row = row % block_size

            if block_col == block_cols - 1:
                part_img_col = col - (cols - block_size)
            else:
                part_img_col = col % block_size

            # block_features_dir is assumed to list the blocks in row-major order
            features_path = block_features_dir[block_row * block_cols + block_col]
            features_array = ImageHandler().get_data(features_path)

            feature = features_array[:, part_img_row, part_img_col]
            if not (np.isnan(feature).any() or np.isinf(feature).any()):
                X_train.append(list(feature))
                Y_train.append([data[2]])
            logger.info('total:%s,num:%s create train set success!', len(measured_data_img), n)
        return np.array(X_train), np.array(Y_train)

    @staticmethod
    def trainRF(X_train, Y_train):
        """
        Train a random-forest classifier.
        """
        logger.info('RF training')
        clf = RandomForestClassifier()
        clf.fit(X_train, Y_train)
        return clf

    @staticmethod
    def trainSVM(X_train, Y_train, cost=1, kernel='rbf'):
        """
        Train an SVM classifier; cost and kernel are passed through to SVC.
        """
        logger.info('svm training')
        clf = SVC(C=cost, cache_size=1000, class_weight='balanced', coef0=0.0,
                  decision_function_shape='ovo', degree=3, gamma='auto', kernel=kernel,
                  max_iter=-1, probability=False, random_state=None,
                  shrinking=True, tol=0.001, verbose=True)
        clf.fit(X_train, Y_train)
        return clf

    @staticmethod
    def vegetationPhenology_combine_feature(feature_dir, workspace_processing_path, name, rows, cols, debug=False):
        """
        Stack all feature images in feature_dir into one multi-band image.
        """
        ml = MachineLeaning()
        path_list = list(glob.glob(os.path.join(feature_dir, '*.tif')))
        # merge the single-band feature images into one array
        name_featuresPath_dic = {}
        dim = len(path_list)
        features_path = os.path.join(workspace_processing_path, name, name + '_features.tif')
        if not debug:  # when debug is True, an existing features file is assumed to be reused
            features_array = np.zeros((dim, rows, cols), dtype='float16')
            for m, path in enumerate(path_list):
                data = ImageHandler.get_data(path)
                data = ml.standardization(data)
                features_array[m, :, :] = data
            # replace NaN/inf with 0
            features_array[np.isnan(features_array)] = 0.0
            features_array[np.isinf(features_array)] = 0.0
            ImageHandler.write_img(features_path, '', [0, 0, 0, 0, 0, 0], features_array)
        name_featuresPath_dic.update({name: features_path})
        return name_featuresPath_dic
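

# ---------------------------------------------------------------------------
# Minimal end-to-end sketch (a usage illustration, not part of the original
# module): the directory paths and train_data_dic below are hypothetical
# placeholders; train_data_dic follows the layout documented above.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    feature_tif_dir = 'D:/workspace/features'               # hypothetical feature directory
    workspace_processing_path = 'D:/workspace/processing/'  # hypothetical workspace
    train_data_dic = {'ids': [1, 2],
                      'positions': [[(10, 20), (30, 40)], [(50, 60), (70, 80)]]}

    # feature selection + training set
    X_train, Y_train, optimal_feature = MachineLeaning.gene_optimal_train_set(
        train_data_dic, feature_tif_dir)
    clf = MachineLeaning.trainRF(X_train, Y_train)

    # block-wise prediction over the full scene
    first_tif = glob.glob(os.path.join(feature_tif_dir, '*.tif'))[0]
    rows = ImageHandler.get_img_height(first_tif)
    cols = ImageHandler.get_img_width(first_tif)
    X_test_list = MachineLeaning.gene_test_set(feature_tif_dir, optimal_feature)
    cover_path = MachineLeaning.predict(clf, X_test_list, 'cover',
                                        workspace_processing_path, rows, cols)
    logger.info('prediction written to %s', cover_path)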