import sklearn  # explicit submodule imports below work around packaging (freeze) errors
import sklearn.utils
import sklearn.utils._cython_blas
import sklearn.utils._weight_vector
import sklearn.neighbors
import sklearn.neighbors._typedefs
import sklearn.neighbors._partition_nodes
import sklearn.neighbors._quad_tree
import sklearn.tree._utils
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.svm import SVC
import numpy as np
from scipy.stats import pearsonr
from tool.algorithm.image.ImageHandle import ImageHandler
from tool.algorithm.block.blockprocess import BlockProcess
import logging
import os
import glob
from PIL import Image
from tool.file.fileHandle import fileHandle
import multiprocessing

logger = logging.getLogger("mylog")
file = fileHandle()


class MachineLeaning:
    """
    Machine learning library.
    """

    def __init__(self):
        pass

    @staticmethod
    def gene_optimal_train_set(train_data_dic, feature_tif_dir, important_threshold=0.3, correlation_threshold=0.7):  # TODO: revisit the feature-importance threshold
        """
        Build a training set restricted to the optimal feature subset.

        :param train_data_dic: training samples read from csv
        :param feature_tif_dir: directory of feature images
        :param important_threshold: minimum feature importance to keep a feature
        :param correlation_threshold: correlation above which redundant features are dropped
        :return X_train, Y_train, optimal_feature
        """
        ml = MachineLeaning()
        name_list = ml.get_name_list(feature_tif_dir)
        X_train, Y_train = ml.gene_train_set(train_data_dic, feature_tif_dir)
        optimal_feature = ml.sel_optimal_feature_set(X_train, Y_train, threshold=important_threshold)
        optimal_feature = ml.remove_correlation_feature(X_train, optimal_feature, threshold=correlation_threshold)
        X_train = X_train[:, optimal_feature]
        logger.info('train_feature:%s', np.array(name_list)[optimal_feature])
        return X_train, Y_train, optimal_feature

    @staticmethod
    def sel_optimal_feature(X_train, Y_train, name_list, important_threshold=0.3, correlation_threshold=0.7):
        """
        Select the optimal feature subset from an existing training set.
        """
        ml = MachineLeaning()
        optimal_feature = ml.sel_optimal_feature_set(X_train, Y_train, threshold=important_threshold)
        optimal_feature = ml.remove_correlation_feature(X_train, optimal_feature, threshold=correlation_threshold)
        X_train = X_train[:, optimal_feature]
        logger.info('train_feature:%s', np.array(name_list)[optimal_feature])
        return X_train, Y_train, optimal_feature

    @staticmethod
    def gene_test_set(feature_tif_dir, optimal_feature):
        """
        Build the test set.

        :param feature_tif_dir: directory of feature images
        :param optimal_feature: optimal feature subset (band indices)
        :return X_test_list: paths of the blocked test-set images
        """
        in_tif_paths = list(glob.glob(os.path.join(feature_tif_dir, '*.tif')))
        cols = ImageHandler.get_img_width(in_tif_paths[0])
        rows = ImageHandler.get_img_height(in_tif_paths[0])
        workspace_block_tif_path = os.path.join(feature_tif_dir, 'block')
        workspace_block_feature_path = os.path.join(feature_tif_dir, 'feature')
        file.creat_dirs([workspace_block_tif_path, workspace_block_feature_path])

        # split the feature images into blocks
        bp = BlockProcess()
        block_size = bp.get_block_size(rows, cols)

        bp.cut(feature_tif_dir, workspace_block_tif_path, ['tif', 'tiff'], 'tif', block_size)
        img_dir, img_name = bp.get_file_names(workspace_block_tif_path, ['tif'])
        dir_dict_all = bp.get_same_img(img_dir, img_name)

        # keep only the feature images in the optimal feature subset
        dir_dict = {}
        for n, key in enumerate(dir_dict_all):
            if n in optimal_feature:
                dir_dict.update({key: dir_dict_all[key]})
        logger.info('test_feature:%s', dir_dict.keys())
        logger.info('blocking tifs success!')
        X_test_list = []

        # stack the feature bands of each block into one multi-band image
        key_name = next(iter(dir_dict))
        block_num = len(dir_dict[key_name])
        for n in range(block_num):
            name = os.path.basename(dir_dict[key_name][n])
            # block names are assumed to end with '_<r0>_<r1>_<c0>_<c1>.tif'
            suffix = '_' + name.split('_')[-4] + "_" + name.split('_')[-3] + "_" + name.split('_')[-2] + "_" + name.split('_')[-1]
            features_path = os.path.join(workspace_block_feature_path, "features" + suffix)
            X_test_list.append(features_path)
            features_array = np.zeros((len(dir_dict), block_size, block_size), dtype='float32')
            for m, value in enumerate(dir_dict.values()):
                features_array[m, :, :] = ImageHandler.get_band_array(value[n])
            features_array[np.isnan(features_array)] = 0.0  # replace NaN with 0
            ImageHandler.write_img(features_path, '', [0, 0, 0, 0, 0, 0], features_array)
        logger.info('create features matrix success!')
        # file.del_folder(workspace_block_tif_path)
        # file.del_folder(workspace_block_feature_path)
        return X_test_list
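
    # Example of the assumed block naming: a block file
    # 'feature_0_1024_2048_3072.tif' yields suffix '_0_1024_2048_3072.tif',
    # from which predict() parses row_begin=0 and col_begin=2048.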

    @staticmethod
    def predict_blok(clf, X_test, rows, cols, img_path, row_begin, col_begin, block_sum, n):
        """
        Predict one block and save the result as an image.
        """
        logger.info('total:%s,block:%s testing data !path:%s', block_sum, n, img_path)

        Y_test = clf.predict(X_test)
        img = Y_test.reshape(rows, cols)
        out_image = Image.fromarray(img)
        out_image.save(img_path)
        # NOTE: per-block georeferencing (SetProjection/SetGeoTransform via GDAL)
        # is intentionally disabled here; the spatial reference is assigned to
        # the combined image after prediction.
        logger.info('total:%s,block:%s test data finished !path:%s', block_sum, n, img_path)
        return True

    @staticmethod
    def predict(clf, X_test_list, out_tif_name, workspace_processing_path, rows, cols):
        """
        Predict block-wise and combine the blocks into one image.

        :param clf: fitted classifier (e.g. an SVM or random-forest model)
        :param X_test_list: paths of the blocked test-set images
        :return cover_path: path of the combined prediction image
        """
        ml = MachineLeaning()
        # process the blocks with a multiprocessing pool
        bp = BlockProcess()
        block_size = bp.get_block_size(rows, cols)

        block_features_dir = X_test_list
        bp_cover_dir = os.path.join(workspace_processing_path, out_tif_name + '\\')
        file.creat_dirs([bp_cover_dir])

        processes_num = min([len(block_features_dir), multiprocessing.cpu_count() - 1])
        pool = multiprocessing.Pool(processes=processes_num)

        for n, path in enumerate(block_features_dir):
            name = os.path.split(path)[1]
            features_array = ImageHandler.get_data(path)

            # (bands, rows, cols) -> (pixels, bands)
            X_test = np.reshape(features_array, (features_array.shape[0], features_array[0].size)).T

            suffix = '_' + name.split('_')[-4] + "_" + name.split('_')[-3] + "_" + name.split('_')[-2] + "_" + name.split('_')[-1]
            img_path = os.path.join(bp_cover_dir, out_tif_name + suffix)
            row_begin = int(name.split('_')[-4])
            col_begin = int(name.split('_')[-2])
            pool.apply_async(ml.predict_blok, (clf, X_test, block_size, block_size, img_path, row_begin, col_begin, len(block_features_dir), n))

        pool.close()
        pool.join()

        # combine the predicted blocks
        data_dir = bp_cover_dir
        out_path = workspace_processing_path[0:-1]  # strip the trailing path separator
        bp.combine(data_dir, cols, rows, out_path, file_type=['tif'], datetype='float32')

        # assign geolocation information
        cover_path = os.path.join(workspace_processing_path, out_tif_name + ".tif")
        # bp.assign_spatial_reference_byfile(self.__ref_img_path, cover_path)
        return cover_path
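
    # Typical call (hypothetical names): with a fitted classifier `clf` and
    # X_test_list from gene_test_set(), the combined product is written via
    #   cover_path = MachineLeaning.predict(clf, X_test_list, 'cover',
    #                                       'D:/proc/', rows, cols)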

    @staticmethod
    def get_name_list(feature_tif_dir):
        """
        List the feature images as 'index: filename' strings.
        """
        in_tif_paths = list(glob.glob(os.path.join(feature_tif_dir, '*.tif')))
        name_list = []
        for n, path in enumerate(in_tif_paths):
            name_list.append(str(n) + ': ' + os.path.split(path)[1])
        logger.info('feature_list:%s', name_list)
        return name_list

    @staticmethod
    def gene_train_set(train_data_dic, feature_tif_dir):
        """
        Build the training set.

        :param train_data_dic: training samples read from csv
        :param feature_tif_dir: directory of feature images
        :return X_train, Y_train: training data
        """
        in_tif_paths = list(glob.glob(os.path.join(feature_tif_dir, '*.tif')))
        dim = len(in_tif_paths)
        X_train = np.empty(shape=(0, dim))
        Y_train = np.empty(shape=(0, 1))

        ids = train_data_dic['ids']
        positions = train_data_dic['positions']
        for label, points in zip(ids, positions):
            if not points:
                raise Exception('data is empty!')
            row, col = zip(*points)
            num_points = len(points)
            X = np.empty(shape=(num_points, dim))

            for n, tif_path in enumerate(in_tif_paths):
                feature_array = ImageHandler.get_data(tif_path)
                feature_array[np.isnan(feature_array)] = 0  # replace NaN with 0
                x = feature_array[row, col].T
                X[:, n] = x

            Y = np.full((num_points, 1), label)
            X_train = np.vstack((X_train, X))
            Y_train = np.vstack((Y_train, Y))
        Y_train = Y_train.T[0, :]

        logger.info("gene_train_set success!")
        return X_train, Y_train
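
    # Expected layout of train_data_dic (an assumption consistent with the
    # parsing in gene_train_set above):
    #   {'ids': [class_id, ...],
    #    'positions': [[(row, col), (row, col), ...], ...]}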

    @staticmethod
    def standardization(data, num=1):
        """
        Scale a matrix linearly to [0, num].
        """
        d_min = np.nanmin(data)
        d_max = np.nanmax(data)
        data[np.isnan(data)] = d_min  # fill NaN with the minimum value
        _range = d_max - d_min
        return (data - d_min) / _range * num
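
    # Worked example: standardization(np.array([2.0, 4.0, 6.0])) scales by
    # (x - 2) / 4 and returns [0.0, 0.5, 1.0].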

    @staticmethod
    def sel_optimal_feature_set(X_train, Y_train, threshold=0.01):
        """
        Select the optimal feature subset by ExtraTrees feature importance.
        """
        model = ExtraTreesClassifier()
        y_max = np.max(Y_train)
        if y_max < 0.1:
            # rescale near-zero continuous labels before casting to int
            Y_train = (Y_train * 10000).astype('int')
        model.fit(X_train, Y_train.astype('int'))
        # relative importance of each feature
        importances = model.feature_importances_
        logger.info('importances:%s,threshold=%s', importances, threshold)

        importances_resort = -np.sort(-importances)  # importances, descending
        imp_argsort = np.argsort(-importances)  # feature indices, by descending importance

        optimal_feature = list(imp_argsort[np.where(importances_resort > threshold)])  # drop low-importance features
        logger.info('optimal_feature:%s', optimal_feature)

        if len(optimal_feature) == 0:
            logger.error('optimal_feature is empty')
            optimal_feature = list(imp_argsort)
        return optimal_feature

    @staticmethod
    def correlation_map(x, y):
        """
        Pairwise Pearson correlation matrix between the rows of x and y.
        """
        # https://blog.csdn.net/weixin_39836726/article/details/110783640
        # cc matrix based on scipy pearsonr
        n_row_x = x.shape[0]
        n_row_y = y.shape[0]
        ccmtx_xy = np.empty((n_row_x, n_row_y))
        for n in range(n_row_x):
            for m in range(n_row_y):
                ccmtx_xy[n, m] = pearsonr(x[n, :], y[m, :])[0]
        return ccmtx_xy
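
    # scipy's pearsonr returns (r, p_value); only r is kept, so
    # correlation_map(x, x) is symmetric with ones on the diagonal.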

    @staticmethod
    def remove_correlation_feature(X_train, validity_list, threshold=0.85):
        """
        Suppress correlated features.

        :param X_train: training set
        :param validity_list: optimal feature subset (sorted by descending importance)
        :param threshold: correlation threshold
        :return validity_list: optimal feature subset with correlated features removed
        """
        ccmtx = MachineLeaning().correlation_map(X_train[:, validity_list].T, X_train[:, validity_list].T)
        ccmtx = np.abs(ccmtx)
        # zero the diagonal and lower triangle so each pair is inspected once
        for r in range(len(validity_list)):
            for c in range(len(validity_list)):
                if c <= r:
                    ccmtx[r, c] = 0
        logger.info('correlation_map:\n %s', ccmtx)
        # for pairs whose correlation exceeds the threshold, drop the
        # lower-importance (higher column index) feature
        high_corr = np.unique(np.where(ccmtx > threshold)[1])  # indices of features to drop
        validity_list = np.delete(validity_list, high_corr)
        logger.info('validity_list_corr:%s', validity_list)
        return validity_list
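
    # Example: with threshold=0.85, if |corr| between validity_list[0] and
    # validity_list[2] is 0.9, column index 2 is dropped and the more
    # important feature validity_list[0] is kept.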

    @staticmethod
    def gene_train_data(block_features_dir, rows, cols, block_size, measured_data_img):
        """
        Build a training set from measured points (row, col, value) and
        blocked feature images.
        """
        X_train = []
        Y_train = []

        block_rows = int(np.ceil(rows / block_size))
        block_cols = int(np.ceil(cols / block_size))

        for n, data in enumerate(measured_data_img):
            row = data[0]
            col = data[1]
            block_row = row // block_size
            block_col = col // block_size

            # blocks on the bottom/right edge are aligned to the image border
            if block_row == block_rows - 1:
                part_img_row = row - (rows - block_size)
            else:
                part_img_row = row % block_size

            if block_col == block_cols - 1:
                part_img_col = col - (cols - block_size)
            else:
                part_img_col = col % block_size

            # block_features_dir is assumed to list the blocks in row-major order
            features_path = block_features_dir[block_row * block_cols + block_col]
            features_array = ImageHandler().get_data(features_path)

            feature = features_array[:, part_img_row, part_img_col]
            if not (np.isnan(feature).any() or np.isinf(feature).any()):
                X_train.append(list(feature))
                Y_train.append([data[2]])
            logger.info('total:%s,num:%s create train set success!', len(measured_data_img), n)
        return np.array(X_train), np.array(Y_train)

    @staticmethod
    def trainRF(X_train, Y_train):
        """
        Train a random-forest classifier.
        """
        logger.info('RF training')
        clf = RandomForestClassifier()
        clf.fit(X_train, Y_train)
        return clf

    @staticmethod
    def trainSVM(X_train, Y_train, cost=1, kernel='rbf'):
        """
        Train an SVM classifier; cost and kernel are passed through to SVC.
        """
        logger.info('svm training')
        clf = SVC(C=cost, cache_size=1000, class_weight='balanced', coef0=0.0,
                  decision_function_shape='ovo', degree=3, gamma='auto', kernel=kernel,
                  max_iter=-1, probability=False, random_state=None,
                  shrinking=True, tol=0.001, verbose=True)
        clf.fit(X_train, Y_train)
        return clf

    @staticmethod
    def vegetationPhenology_combine_feature(feature_dir, workspace_processing_path, name, rows, cols, debug=False):
        """
        Stack all feature images in feature_dir into one multi-band image.
        """
        ml = MachineLeaning()
        path_list = list(glob.glob(os.path.join(feature_dir, '*.tif')))
        # merge the single-band feature images into one array
        name_featuresPath_dic = {}
        dim = len(path_list)
        features_path = os.path.join(workspace_processing_path, name, name + '_features.tif')
        if not debug:  # when debug is True, an existing features file is assumed to be reused
            features_array = np.zeros((dim, rows, cols), dtype='float16')
            for m, path in enumerate(path_list):
                data = ImageHandler.get_data(path)
                data = ml.standardization(data)
                features_array[m, :, :] = data
            # replace NaN/inf with 0
            features_array[np.isnan(features_array)] = 0.0
            features_array[np.isinf(features_array)] = 0.0
            ImageHandler.write_img(features_path, '', [0, 0, 0, 0, 0, 0], features_array)
        name_featuresPath_dic.update({name: features_path})
        return name_featuresPath_dic
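

# ---------------------------------------------------------------------------
# Minimal end-to-end sketch (a usage illustration, not part of the original
# module): the directory paths and train_data_dic below are hypothetical
# placeholders; train_data_dic follows the layout documented above.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    feature_tif_dir = 'D:/workspace/features'               # hypothetical feature directory
    workspace_processing_path = 'D:/workspace/processing/'  # hypothetical workspace
    train_data_dic = {'ids': [1, 2],
                      'positions': [[(10, 20), (30, 40)], [(50, 60), (70, 80)]]}

    # feature selection + training set
    X_train, Y_train, optimal_feature = MachineLeaning.gene_optimal_train_set(
        train_data_dic, feature_tif_dir)
    clf = MachineLeaning.trainRF(X_train, Y_train)

    # block-wise prediction over the full scene
    first_tif = glob.glob(os.path.join(feature_tif_dir, '*.tif'))[0]
    rows = ImageHandler.get_img_height(first_tif)
    cols = ImageHandler.get_img_width(first_tif)
    X_test_list = MachineLeaning.gene_test_set(feature_tif_dir, optimal_feature)
    cover_path = MachineLeaning.predict(clf, X_test_list, 'cover',
                                        workspace_processing_path, rows, cols)
    logger.info('prediction written to %s', cover_path)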