#!/usr/bin/env python3
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Copyright 2012 California Institute of Technology. ALL RIGHTS RESERVED.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# United States Government Sponsorship acknowledged. This software is subject to
# U.S. export control laws and regulations and has been classified as 'EAR99 NLR'
# (No [Export] License Required except when exporting to an embargoed country,
# end user, or in support of a prohibited end use). By downloading this software,
# the user agrees to comply with all applicable U.S. export laws and regulations.
# The user has the responsibility to obtain export licenses, or other export
# authority as may be required before exporting this software to any 'EAR99'
# embargoed foreign country or citizen of those countries.
#
# Author: Giangi Sacco
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
import isce
import zipfile
import os
import sys
from isce import logging
from iscesys.Component.Component import Component
import shutil
from urllib import request
from urllib.parse import urlparse
import time
#Parameter definitions
URL = Component.Parameter('_url',
    public_name='URL',
    default='',
    type=str,
    mandatory=False,
    doc="URL where to get the data from")
USERNAME = Component.Parameter('_un',
    public_name='username',
    default=None,
    type=str,
    mandatory=False,
    doc="Username in case the url is password protected")
PASSWORD = Component.Parameter('_pw',
    public_name='password',
    default=None,
    type=str,
    mandatory=False,
    doc="Password in case the url is password protected")
DIRECTORY = Component.Parameter('_downloadDir',
    public_name='directory',
    default='./',
    type=str,
    mandatory=False,
    doc="Location where the files are downloaded")
WAIT = Component.Parameter('_wait',
    public_name='wait',
    default=5,
    type=float,
    mandatory=False,
    doc="Wait time between trials when the server is down")
NUM_TRIALS = Component.Parameter('_numTrials',
    public_name='number of trials',
    default=3,
    type=int,
    mandatory=False,
    doc="Number of times it tries to download the file when the server is down")
PROCEED_IF_NO_SERVER = Component.Parameter('_proceedIfNoServer',
    public_name='proceed if no server',
    default=False,
    type=bool,
    mandatory=False,
    doc='Flag to continue even if the server is down.')
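#Note: the configurable attributes above are exposed through the Component machinery
#under their public names ('URL', 'username', 'password', 'directory', 'wait',
#'number of trials', 'proceed if no server'); see parameter_list below.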
## This class provides a set of convenience methods to retrieve files from a (possibly
# password protected) URL, decompress them if necessary, and report the download status.
class DataRetriever(Component):
    def serverUp(self,url,needCredentials=False):
        urlp = urlparse(url)
        server = urlp.scheme + "://" + urlp.netloc
        ret = False
        if needCredentials:
            try:
                request.urlopen(server)
                ret = True
            except Exception as e:
                try:
                    #when the server needs credentials, opening the url fails
                    #with one of the messages checked below
                    if e.reason.reason.count('CERTIFICATE_VERIFY_FAILED'):
                        ret = True
                except:
                    try:
                        if ''.join(e.reason.split()).lower() == 'authorizationrequired':
                            ret = True
                    except:
                        #otherwise assume that the exception was due to the server being down
                        ret = False
        else:
            try:
                request.urlopen(server)
                ret = True
            except Exception:
                #in this case assume directly that the server is down
                ret = False
        return ret
    ##
    # Fetches the files in listFile from the URL.
    # @param listFile \c list of the filenames to be retrieved.
    def getFiles(self,listFile):
        try:
            os.makedirs(self._downloadDir)
        except:
            #dir already exists
            pass
        #curl with -O downloads in the working dir, so save the cwd
        cwd = os.getcwd()
        #move to _downloadDir
        os.chdir(self._downloadDir)
        for fileNow in listFile:
            reason = 'file'
            for i in range(self._numTrials):
                try:
                    if not os.path.exists(fileNow):
                        if(self._un is None or self._pw is None):
                            if not self.serverUp(self._url):
                                reason = 'server'
                                raise Exception
                            if os.path.exists(os.path.join(os.environ['HOME'],'.netrc')):
                                command = 'curl -n -L -c $HOME/.earthdatacookie -b $HOME/.earthdatacookie -k -f -O ' + os.path.join(self._url,fileNow)
                                print("command = {}".format(command))
                            else:
                                self.logger.error('Please create a .netrc file in your home directory containing\nmachine urs.earthdata.nasa.gov\n\tlogin yourusername\n\tpassword yourpassword')
                                sys.exit(1)
                        else:
                            if not self.serverUp(self._url,True):
                                reason = 'server'
                                raise Exception
                            command = 'curl -k -f -u ' + self._un + ':' + self._pw + ' -O ' + os.path.join(self._url,fileNow)
                        if os.system(command):
                            raise Exception
                    self._downloadReport[fileNow] = self._succeded
                    break
                except Exception as e:
                    if reason == 'file':
                        self.logger.warning('There was a problem in retrieving the file %s. The requested file does not seem to be present on the server.'%(os.path.join(self._url,fileNow)))
                        #if the problem is a missing file, break the loop that retries when the server is down
                        self._downloadReport[fileNow] = self._failed
                        break
                    elif reason == 'server':
                        if i == self._numTrials - 1 and not self._proceedIfNoServer:
                            self.logger.error('There was a problem in retrieving the file %s. Check the name of the server or try again later in case the server is momentarily down.'%(os.path.join(self._url,fileNow)))
                            sys.exit(1)
                        if i == self._numTrials - 1 and self._proceedIfNoServer:
                            self._downloadReport[fileNow] = self._failed
                        else:
                            time.sleep(self._wait)
        self.decompressFiles(listFile,self._downloadReport,os.getcwd())
        self.clean(listFile,self._downloadReport)
        #move back to the original directory
        os.chdir(cwd)
    def decompressFiles(self,listFile,report,cwd='./'):
        import tempfile as tf
        for file in listFile:
            if report[file] == self._succeded:
                td = tf.TemporaryDirectory()
                self.decompress(file,td.name)
                self._namesMapping[file] = os.listdir(td.name)
                for name in self._namesMapping[file]:
                    try:
                        shutil.move(os.path.join(td.name,name),cwd)
                    except Exception:
                        #probably the file already exists: remove it and try again
                        try:
                            os.remove(os.path.join(cwd,name))
                            shutil.move(os.path.join(td.name,name),cwd)
                        except Exception:
                            print('Cannot decompress file',name)
                            raise Exception
    def clean(self,listFile,report):
        for file in listFile:
            if report[file] == self._succeded:
                os.remove(file)
    ##
    # After retrieving the files this function prints the status of the download for each file,
    # which can be 'succeeded' or 'failed'.
    def printDownloadReport(self):
        for k,v in self._downloadReport.items():
            print('Download of file',k,v,'.')
    ##
    # This function returns a dictionary whose keys are the files whose download was attempted
    # and whose values are the status of the download, 'succeeded' or 'failed'.
    # @return \c dictionary whose keys are the attempted files and whose values are
    # the status of the download, 'succeeded' or 'failed'.
    def getDownloadReport(self):
        return self._downloadReport
    ##
    # Function that decompresses the file into the given directory.
    # @param filename \c string the name of the file to decompress.
    # @param ddir \c string the directory where the file is extracted.
    def decompress(self,filename,ddir):
        ex = self.getExtractor(filename)
        ex.extractall(ddir)
    ##
    # Inspect the file to determine the right extractor. If the archive type cannot be
    # determined an exception is raised.
    def getExtractor(self,filename):
        import tarfile
        import zipfile
        from . import gzipfile
        ret = None
        if(tarfile.is_tarfile(filename)):
            ret = tarfile.TarFile(filename)
        elif(zipfile.is_zipfile(filename)):
            ret = zipfile.ZipFile(filename)
        elif(gzipfile.is_gzipfile(filename)):
            ret = gzipfile.GZipFile(filename)
        else:
            print('Unrecognized archive type')
            raise Exception
        return ret
    @property
    def proceedIfNoServer(self):
        return self._proceedIfNoServer
    @proceedIfNoServer.setter
    def proceedIfNoServer(self,proceedIfNoServer):
        self._proceedIfNoServer = proceedIfNoServer
    @property
    def url(self):
        return self._url
    @url.setter
    def url(self,url):
        self._url = url
    @property
    def un(self):
        return self._un
    @un.setter
    def un(self,un):
        self._un = un
    @property
    def pw(self):
        return self._pw
    @pw.setter
    def pw(self,pw):
        self._pw = pw
    ##
    # Setter function for the download directory.
    # @param ddir \c string directory where the data are downloaded.
    @property
    def downloadDir(self):
        return self._downloadDir
    @downloadDir.setter
    def downloadDir(self,ddir):
        self._downloadDir = ddir
    def __getstate__(self):
        d = dict(self.__dict__)
        del d['logger']
        return d
    def __setstate__(self,d):
        self.__dict__.update(d)
        self.logger = logging.getLogger('isce.iscesys.DataRetriever')
        return
    family = 'dataretriever'
    parameter_list = (
        URL,
        USERNAME,
        PASSWORD,
        DIRECTORY,
        WAIT,
        NUM_TRIALS,
        PROCEED_IF_NO_SERVER
    )
    def __init__(self,family = '', name = ''):
        #map of the names before and after decompression
        self._namesMapping = {}
        self._downloadReport = {}
        # Note: if _useLocalDirectory is True then the downloadDir is the local directory
        ##self._downloadDir = os.getcwd()#default to the cwd
        self._failed = 'failed'
        self._succeded = 'succeeded'
        super(DataRetriever, self).__init__(family if family else self.__class__.family, name=name)
        # logger not defined until the baseclass is called
        if not self.logger:
            self.logger = logging.getLogger('isce.iscesys.DataRetriever')
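
#Minimal usage sketch (illustrative only): the URL, directory, and archive names below
#are placeholders, not real resources.
if __name__ == '__main__':
    retriever = DataRetriever(name='data_retriever')
    retriever.configure()                                  #populate parameters with their defaults
    retriever.url = 'https://example.com/data'             #placeholder server URL
    retriever.downloadDir = './downloads'                  #archives are fetched into this directory
    retriever.proceedIfNoServer = True                     #do not abort if the server is unreachable
    retriever.getFiles(['file1.tar.gz', 'file2.zip'])      #hypothetical archive names
    retriever.printDownloadReport()                        #prints 'succeeded' or 'failed' per file
    report = retriever.getDownloadReport()                 #dict mapping filename -> status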