Source code for connectors.nasapower_connector


import h5netcdf
import io
import logging
import numpy as np
import multiprocessing as mp
import os
import pandas as pd
import re
import requests
import s3fs
import sys
import xarray as xr

from tqdm import tqdm

class NASAPowerClimateDataConnector():
    """Query and parse daily climate data from the NASA POWER database.

    Climate variables are named using the SILO short names (``daily_rain``,
    ``max_temp``, ``min_temp``, ``radiation``) and are translated internally
    to the corresponding NASA POWER parameter codes.

    Args:
        climate_variables (list of str): SILO-style names of the variables
            that will be requested. Every name must be a key of
            ``nasapower_climate_variable_code``; an unknown name raises
            ``KeyError``.
        data_source (str): label of the climate database the values are
            extracted from. It is only stored on the instance.
        input_path (str or None): when ``None`` the data is requested from
            the NASA POWER API; otherwise ``get_yearly_data`` reads from the
            dataset object handed to it.

    """

    def __init__(self, climate_variables, data_source="silo", input_path=None):

        # Setup logging. The package ("bestiapop") runs quieter than a
        # direct/script import of this module.
        logger = logging.getLogger('POPBEAST.NASAPOWER_CONNECTOR')
        running_inside_package = 'bestiapop' in __name__

        try:
            import coloredlogs
            coloredlogs.install(
                fmt='%(asctime)s - %(name)s - %(message)s',
                level="WARNING" if running_inside_package else "DEBUG",
                logger=logger,
            )

        except ModuleNotFoundError:
            # logging.getLogger returns the same object for every instance of
            # this class, so only attach the console handler once; otherwise
            # each new connector would duplicate every log line.
            if not logger.handlers:
                formatter = logging.Formatter('%(asctime)s - %(name)s - %(message)s')
                console_handler = logging.StreamHandler()
                console_handler.setFormatter(formatter)
                console_handler.setLevel(logging.DEBUG)
                logger.addHandler(console_handler)
            logger.setLevel(logging.WARNING if running_inside_package else logging.INFO)

        # Setting up class variables
        self.logger = logger
        self.data_source = data_source
        self.input_path = input_path
        self.climate_data = {}
        self.climate_variables = climate_variables

        # Translation table: SILO short name -> NASA POWER parameter code
        self.nasapower_climate_variable_code = {
            "daily_rain":           "PRECTOT",
            "max_temp":             "T2M_MAX",
            "min_temp":             "T2M_MIN",
            "radiation":            "ALLSKY_SFC_SW_DWN"
        }

        # Comma separated list of NASA POWER codes for every requested
        # variable, ready to be passed as the API "parameters" argument
        self.nasapower_climate_variables_string = ",".join(
            self.nasapower_climate_variable_code[x] for x in self.climate_variables
        )

        # Quick translation helper (kept as an instance attribute for callers)
        self._Translate_Climate_Var = lambda x: self.nasapower_climate_variable_code[x]

    def _cached_point_matches(self, lat, lon):
        """Return True when the cached API response belongs to (lat, lon)."""
        try:
            cached_lon, cached_lat, _cached_elev = self.climate_metadata_coordinates
            # The API echoes approximate coordinates with 5 decimals
            # (e.g. -41.14999 for -41.15), so compare at 2 decimals. The
            # requested values are rounded as well so that float noise in a
            # generated coordinate series does not force a new download.
            same_lat = np.round(cached_lat, decimals=2) == np.round(lat, decimals=2)
            same_lon = np.round(cached_lon, decimals=2) == np.round(lon, decimals=2)
        except (AttributeError, TypeError, ValueError):
            # Nothing cached yet, or the cached coordinates are unusable
            return False
        return bool(same_lat and same_lon)

    def _fetch_point_from_api(self, lat, lon, year_range):
        """Download all requested variables for one point and cache them on the instance."""
        nasapower_api_url = "https://power.larc.nasa.gov/cgi-bin/v1/DataAccess.py"

        payload = {
            "request": "execute",
            "tempAverage": "DAILY",
            "identifier": "SinglePoint",
            "parameters": self.nasapower_climate_variables_string,
            "lat": lat,
            "lon": lon,
            # The whole year range is requested at once: first to last year
            "startDate": "{}0101".format(year_range[0]),
            "endDate": "{}1231".format(year_range[-1]),
            "userCommunity": "AG",
            "outputList": "JSON",
            "user": "anonymous"
        }

        response = requests.get(nasapower_api_url, params=payload)
        json_data = response.json()

        # Relevant part of the response:
        #   {'features': [
        #       {'geometry': {'coordinates': [145.50001, -41.14999, 325.05],
        #                     'type': 'Point'},
        #        'properties': {
        #            'parameter': {
        #                'ALLSKY_SFC_SW_DWN': {'20100101': 29.31, '20100102': 23.84, ...},
        #                'PRECTOT': {'20100101': 0.19, '20100102': 1.75, ...},
        #                ...}},
        #        'type': 'Feature'}],
        #    'header': {..., 'fillValue': '-99', ...},
        #    ...}
        feature = json_data['features'][0]

        # Keep everything on the instance so that the following calls for the
        # same point (other years / variables) do not hit the API again
        self.climate_metadata_coordinates = feature['geometry']['coordinates']
        self.climate_data = feature['properties']['parameter']

    def get_yearly_data(self, lat, lon, value_array, year, year_range, climate_variable):
        """Extract one year of daily values for one variable at one grid point.

        When ``self.input_path`` is ``None`` the values come from the NASA
        POWER API: the first call for a point downloads every requested
        variable for the whole ``year_range`` and caches the response on the
        instance, later calls for the same point reuse it. Otherwise the
        values are read from ``value_array``, which is closed afterwards.

        Args:
            lat (float): the latitude that values should be returned for
            lon (float): the longitude that values should be returned for
            value_array (xarray.Dataset): dataset to extract values from; only
                used when ``self.input_path`` is not ``None``
            year (int): the year to extract
            year_range (sequence of int): all the years being processed; the
                first and last element delimit the API request
            climate_variable (str): SILO-style short name of the variable

        Raises:
            ValueError: if no values were collected for this lat/lon/year
                combination, if every collected value is NaN, or if the number
                of values does not match the number of days in the year. The
                calling function uses this as the signal to ignore this
                particular year-lat-lon combination.

        Returns:
            pandas.core.frame.DataFrame: a dataframe with 5 columns: ``lon``,
            ``lat``, ``year``, ``days`` (day of the year, starting at 1) and
            one column named after ``climate_variable`` holding the values.
            Values below -98 (the -99 fill value) are treated as missing and
            replaced by the mean of the nearest valid neighbours, or by the
            single nearest valid value at the start/end of the year.

        The NASA POWER database is a global database of daily weather data
        specifically designed for agrometeorological applications. The spatial
        resolution of the database is 0.5x0.5 degrees (as of 2018).
        For more information on the NASA POWER database see the documentation
        at: http://power.larc.nasa.gov/common/AgroclimatologyMethodology/Agro_Methodology_Content.html
        Important NOTE: as per https://power.larc.nasa.gov/docs/services/api/v1/temporal/daily/,
        any latitude-longitude combinations within a 0.5x0.5 degrees grid box
        will yield the same weather data. Thus, there is no difference for data returned between
        lat/lon -41.5/145.3 and lat/lon -41.8/145.7. When BestiaPop requests data from
        NASA Power, it will automatically create coordinate series with 1 degree jumps. So if you
        pass in `-lat "-41.15 -55.05"` the resulting series will be: [-55.05, -54.05, -53.05, -52.05, -51.05,
        -50.05, -49.05, -48.05, -47.05, -46.05, -45.05, -44.05, -43.05, -42.05, -41.05]. Please bear in mind
        that there is no difference between -41.15 and -41.05.
        """

        # Day-of-year labels: 1..366 on leap years, 1..365 otherwise
        is_leap_year = (year % 400 == 0) or (year % 4 == 0 and year % 100 != 0)
        days = np.arange(1, (366 if is_leap_year else 365) + 1)

        if self.input_path is None:
            # Reading from NASA POWER: use its API instead of xarray
            self.logger.info("Extracting data from NASA POWER Climate DataBase")

            if not self._cached_point_matches(lat, lon):
                self.logger.debug("Need to get data from the NASA Power Cloud")
                self._fetch_point_from_api(lat, lon, year_range)

            # Keys look like 'YYYYMMDD'; keep the entries of the requested year
            translated_climate_variable = self._Translate_Climate_Var(climate_variable)
            values_by_date = self.climate_data[translated_climate_variable]
            data_values = [
                value for date_key, value in values_by_date.items()
                if int(date_key[:4]) == year
            ]

        else:
            # Extract locally from the dataset, rounding to a single decimal
            self.logger.debug("Reading array data from NetCDF with xarray")

            data_values = [np.round(x, decimals=1) for x in value_array[climate_variable].sel(lat=lat, lon=lon).values]

            # closing handle to xarray DataSet
            value_array.close()

        # No values at all, or nothing but NaN, means there is no data for
        # this point in the grid. The emptiness test comes first so that an
        # empty list produces the documented ValueError and not an IndexError.
        if len(data_values) == 0 or np.isnan(np.asarray(data_values, dtype=float)).all():
            self.logger.warning("THERE ARE NO VALUES FOR LAT {} LON {} VARIABLE {}".format(lat, lon, climate_variable))
            raise ValueError('no_data_for_lat_lon')

        # A partial year cannot be aligned with the day labels. pandas would
        # reject it with a ValueError anyway; raise it explicitly with a
        # message that says what happened.
        if len(data_values) != len(days):
            self.logger.warning("Expected {} daily values for year {} but got {}".format(len(days), year, len(data_values)))
            raise ValueError('incomplete_data_for_year')

        df = pd.DataFrame.from_dict({'days': days, climate_variable: data_values})

        # Turn the -99 fill values into NaN ...
        series = df[climate_variable]
        series = series.mask(series < -98)

        # ... and fill every gap with the mean of its nearest valid
        # neighbours. Chaining the opposite fill covers gaps at the very
        # start/end of the year, where only one neighbour exists.
        previous_valid = series.ffill().bfill()
        next_valid = series.bfill().ffill()
        df[climate_variable] = (previous_valid + next_valid) / 2

        # adding the "year" and the coordinates to the df so as to prepare it
        # for export to other formats (CSV, MET, etc.)
        df.insert(0, 'year', year)
        df.insert(0, 'lat', lat)
        df.insert(0, 'lon', lon)

        return df

    def generate_climate_dataframe_from_nasapower_cloud_api(self, year_range, climate_variables, lat_range, lon_range, input_dir):
        """Build a dataframe with daily values for every lat, lon, variable and year requested.

        Every combination is obtained through ``get_yearly_data`` and the
        resulting frames are stacked, keeping the per-year row index of each
        frame. When a combination has no data, the longitude is recorded as
        empty: the remaining variables and years of that point are not
        processed and the longitude is skipped for all following latitudes.

        Args:
            year_range (numpy.ndarray): a numpy array with all the years for which we are seeking data.
            climate_variables (list of str): the climate variable short names as per SILO nomenclature. For SILO check https://www.longpaddock.qld.gov.au/silo/about/climate-variables/. Variable names are automatically translated from SILO to NASAPOWER codes.
            lat_range (numpy.ndarray): a numpy array of latitude values to extract data from
            lon_range (numpy.ndarray): a numpy array of longitude values to extract data from
            input_dir (str): not used by this method; kept so that the signature stays the same for callers.

        Returns:
            tuple: a tuple consisting of (a) the final dataframe containing values for all years, latitudes and longitudes (empty if nothing was collected), (b) the curated array of longitudes, which excludes all those lon values recorded as empty. The tuple is ordered as follows: (final_dataframe, final_lon_range)
        """

        self.logger.debug('Generating DataFrames')

        # Frames are collected here and concatenated once at the end
        # (DataFrame.append no longer exists in pandas 2.x)
        collected_frames = []

        # lon coordinates where there are no values
        empty_lon_coordinates = []

        for lat in tqdm(lat_range, file=sys.stdout, ascii=True, desc="Total Progress"):

            for lon in lon_range:

                if lon in empty_lon_coordinates:
                    continue

                lon_has_data = True

                for climate_variable in climate_variables:

                    self.logger.debug('Processing data for climate variable {}'.format(climate_variable))

                    for year in year_range:

                        self.logger.debug('Processing data for year {}'.format(year))

                        self.logger.debug('Processing Variable {} - Lat {} - Lon {} for Year {}'.format(climate_variable, lat, lon, year))

                        # get_yearly_data raises ValueError when there are no
                        # values for this particular lat & lon combination
                        try:
                            collected_frames.append(
                                self.get_yearly_data(lat, lon, None, year, year_range, climate_variable)
                            )

                        except ValueError:
                            self.logger.warning("This longitude value will be skipped for the rest of the climate variables and years")
                            self.logger.warning("Deleting lon {} in array position {}".format(lon, np.where(np.asarray(lon_range) == lon)[0][0]))
                            empty_lon_coordinates.append(lon)
                            lon_has_data = False
                            break

                    if not lon_has_data:
                        # Leave the variable loop as well, as announced in
                        # the warning above
                        break

        # pd.concat refuses an empty list, so fall back to an empty frame
        if collected_frames:
            total_climate_df = pd.concat(collected_frames)
        else:
            total_climate_df = pd.DataFrame()

        # Remove any empty lon values from longitude array so as to avoid empty MET generation
        final_lon_range = np.setdiff1d(lon_range, np.array(empty_lon_coordinates))

        # Return results in a tuple
        return (total_climate_df, final_lon_range)