Source code for connectors.nasapower_connector


import h5netcdf
import io
import logging
import numpy as np
import multiprocessing as mp
import os
import pandas as pd
import re
import requests
import s3fs
import sys
import xarray as xr

from tqdm import tqdm

class NASAPowerClimateDataConnector():
    """Query and parse daily climate data from the NASA POWER database.

    The NASA POWER database is a global database of daily weather data
    specifically designed for agrometeorological applications. The spatial
    resolution of the database is 0.5x0.5 degrees (as of 2018). For more
    information on the NASA POWER database see the documentation at:
    http://power.larc.nasa.gov/common/AgroclimatologyMethodology/Agro_Methodology_Content.html

    The `NASAPowerClimateDataConnector` is used by BestiaPop to retrieve data
    from the NASA POWER database and provides functions to parse and extract
    relevant information from it.

    Important NOTE: as per
    https://power.larc.nasa.gov/docs/services/api/v1/temporal/daily/, any
    latitude-longitude combinations within a 0.5x0.5 degrees grid box will
    yield the same weather data. Thus, there is no difference for data
    returned between lat/lon -41.5/145.3 and lat/lon -41.8/145.7. When
    BestiaPop requests data from NASA POWER, it will automatically create
    coordinate series with 1 degree jumps. So if you pass in
    `-lat "-41.15 -55.05"` the resulting series will be: [-55.05, -54.05,
    -53.05, -52.05, -51.05, -50.05, -49.05, -48.05, -47.05, -46.05, -45.05,
    -44.05, -43.05, -42.05, -41.05]. Please bear in mind that there is no
    difference between -41.15 and -41.05.

    Args:
        climate_variables (list): SILO-style short names of the variables to
            request. Each one must be a key of
            ``nasapower_climate_variable_code`` ("daily_rain", "max_temp",
            "min_temp", "radiation"); any other name raises ``KeyError``.
        data_source (str): The climate database where the values are being
            extracted from: SILO or NASAPOWER. Only stored on the instance.
        input_path (str): When ``None`` (the default) data is fetched from
            the NASA POWER API; otherwise ``get_yearly_data`` reads from the
            xarray Dataset handed to it.
    """

    # Endpoint of the NASA POWER daily point API (V2 JSON layout).
    NASAPOWER_API_URL = "https://power.larc.nasa.gov/api/temporal/daily/point"

    # Upper bound (seconds) for a single API request, so that a stalled
    # connection cannot hang the whole run forever.
    API_TIMEOUT_SECONDS = 300

    def __init__(self, climate_variables, data_source="silo", input_path=None):
        # Setup logging
        # We need to pass the "logger" to any Classes or Modules that may use
        # it in our script
        logger_name = 'POPBEAST.NASAPOWER_CONNECTOR'
        log_format = '%(asctime)s - %(name)s - %(message)s'
        running_inside_bestiapop = 'bestiapop' in __name__

        try:
            import coloredlogs
            logger = logging.getLogger(logger_name)
            coloredlogs.install(
                fmt=log_format,
                level="WARNING" if running_inside_bestiapop else "DEBUG",
                logger=logger,
            )
        except ModuleNotFoundError:
            logger = logging.getLogger(logger_name)
            # logging.getLogger returns the same object for the same name, so
            # only attach the console handler once; otherwise every new
            # connector instance would duplicate each log line.
            if not logger.handlers:
                console_handler = logging.StreamHandler()
                console_handler.setFormatter(logging.Formatter(log_format))
                console_handler.setLevel(logging.DEBUG)
                logger.addHandler(console_handler)
            if running_inside_bestiapop:
                logger.setLevel(logging.WARNING)
            else:
                logger.setLevel(logging.INFO)

        # Setting up class variables
        self.logger = logger
        self.data_source = data_source
        self.input_path = input_path
        self.climate_data = {}
        # Coordinates ([lon, lat, elevation]) of the last API response held
        # in self.climate_data; None until the first successful API call.
        self.climate_metadata_coordinates = None
        self.climate_variables = climate_variables

        # Variable names in NASAPOWER DB
        # nasapower_variables = ["ALLSKY_TOA_SW_DWN", "ALLSKY_SFC_SW_DWN", "T2M", "T2M_MIN", "T2M_MAX", "T2MDEW", "WS2M", "PRECTOT"]

        # Setup Climate Variable Code Translations
        # NASAPOWER Climate variable dict
        self.nasapower_climate_variable_code = {
            "daily_rain": "PRECTOTCORR",
            "max_temp": "T2M_MAX",
            "min_temp": "T2M_MIN",
            "radiation": "ALLSKY_SFC_SW_DWN",
        }

        # Generate the comma separated list with all variables passed as part
        # of the user's request, ready to be handed to the API request
        self.nasapower_climate_variables_string = ",".join(
            self.nasapower_climate_variable_code[x] for x in self.climate_variables
        )

    def _Translate_Climate_Var(self, x):
        """Translate a SILO-style variable name into its NASA POWER code.

        Kept as a method (rather than a lambda stored on the instance) so
        that instances remain picklable.

        Raises:
            KeyError: if ``x`` is not a known variable name.
        """
        return self.nasapower_climate_variable_code[x]

    def _cached_data_matches(self, lat, lon):
        """Return True when the cached API response belongs to (lat, lon)."""
        coordinates = getattr(self, "climate_metadata_coordinates", None)
        if coordinates is None:
            return False
        try:
            cached_lon, cached_lat = coordinates[0], coordinates[1]
        except (TypeError, IndexError):
            return False
        # Need to round values since NASA POWER API returns approximative
        # numbers with 5 decimals
        return bool(
            np.round(cached_lat, decimals=2) == lat
            and np.round(cached_lon, decimals=2) == lon
        )

    def _fetch_from_api(self, lat, lon, year_range):
        """Call the NASA POWER API for one point and cache the response.

        All requested variables for the whole ``year_range`` are fetched in a
        single request and stored in ``self.climate_data`` (with the response
        coordinates in ``self.climate_metadata_coordinates``) so subsequent
        years/variables for the same point do not hit the network again.

        Raises:
            requests.exceptions.RequestException: on timeout, connection
                problems or a non-successful HTTP status.
        """
        self.logger.debug("Need to get data from the NASA Power Cloud")

        # Obtaining start and end years for API call
        year_start = year_range[0]
        year_end = year_range[-1]

        payload = {
            "request": "execute",
            "tempAverage": "DAILY",
            "identifier": "SinglePoint",
            "parameters": self.nasapower_climate_variables_string,
            "latitude": lat,
            "longitude": lon,
            "start": "{}0101".format(year_start),
            "end": "{}1231".format(year_end),
            "community": "ag",
            "format": "json",
            "user": "anonymous",
            "header": "true",
            "time-standard": "lst",
        }
        response = requests.get(
            self.NASAPOWER_API_URL,
            params=payload,
            timeout=self.API_TIMEOUT_SECONDS,
        )
        # Fail with a meaningful HTTP error instead of an opaque KeyError
        # further down when the API rejects the request.
        response.raise_for_status()
        json_data = response.json()

        # Shape of data returned by NasaPower V2 (Original Bestiapop was
        # written based on NASAPOWER API V1). NASAPOWER API V2 has changed a
        # little bit the JSON structure and renamed PRECTOT to PRECTOTCORR:
        #
        # {
        #   'type': 'Feature',
        #   'geometry': {'type': 'Point', 'coordinates': [145.5, -41.15, 173.75]},
        #   'properties': {
        #     'parameter': {
        #       'ALLSKY_SFC_SW_DWN': {'20160101': 28.56, '20160102': 26.17, ...},
        #       'T2M_MAX': {'20160101': 26.89, '20160102': 20.3, ...},
        #       'T2M_MIN': {'20160101': 16.97, '20160102': 15.45, ...},
        #       'PRECTOTCORR': {'20160101': 0.01, '20160102': 0.13, ...}
        #     }
        #   },
        #   'header': {..., 'fill_value': -999.0, 'start': '20160101', 'end': '20171231'},
        #   'messages': [],
        #   'parameters': {'T2M_MAX': {'units': 'C', 'longname': ...}, ...},
        #   'times': {'data': 2.03, 'process': 0.02}
        # }

        # Capture all the climate variables inside this class object to not
        # have to repeat calls to the cloud API
        self.climate_metadata_coordinates = json_data['geometry']['coordinates']
        self.climate_data = json_data['properties']['parameter']

    def get_yearly_data(self, lat, lon, value_array, year, year_range, climate_variable):
        """Extract values from an API endpoint in the cloud or a xarray.Dataset object

        Args:
            lat (float): the latitude that values should be returned for
            lon (float): the longitude that values should be returned for
            value_array (xarray.Dataset): the xarray Dataset object to extract
                values from. Only used (and closed) when ``input_path`` was
                given to the constructor.
            year (int): the year to extract
            year_range (sequence): all the years being processed; its first
                and last items bound the API request
            climate_variable (str): the climate variable short name

        Raises:
            ValueError: if there was "NO" data available for all days under a
                particular combination of lat & lon (no values at all, or
                only NaN values). This signals the calling function that it
                should ignore this particular year-lat-lon combination.

        Returns:
            pandas.core.frame.DataFrame: a dataframe containing 5 columns:
            the longitude, the latitude, the year, the Julian day and the
            grid data value for that day.
        """
        # Julian days (1-based): 366 of them on leap years, 365 otherwise
        is_leap_year = (year % 400 == 0) or ((year % 4 == 0) and (year % 100 != 0))
        days = np.arange(1, 367 if is_leap_year else 366, 1)

        if self.input_path is None:
            # If we are attempting to read from NasaPower, use its API
            # instead of Xarray
            self.logger.info("Extracting data from NASA POWER Climate DataBase")

            # Re-use the data from a previous API call unless it belongs to
            # different coordinates than those being requested
            if not self._cached_data_matches(lat, lon):
                self._fetch_from_api(lat, lon, year_range)

            # Proceed to extract the values into a list for each day in the year
            yearly_series = self.climate_data[self._Translate_Climate_Var(climate_variable)]
            data_values = [
                value for date, value in yearly_series.items()
                if int(date[:4]) == year
            ]
        else:
            # Extract locally from NetCDF4 files, capturing all daily values
            # for the given lat/lon combination rounded to a single decimal
            self.logger.debug("Reading array data from NetCDF with xarray")
            data_values = [
                np.round(x, decimals=1)
                for x in value_array[climate_variable].sel(lat=lat, lon=lon).values
            ]
            # closing handle to xarray DataSet
            value_array.close()

        # If there was "NO" data available for this combination of lat & lon
        # (nothing was collected, or every value is NaN) there is no data for
        # that point in the grid: tell the caller to skip it.
        has_no_data = (
            len(data_values) == 0
            or bool(np.isnan(np.asarray(data_values, dtype=float)).all())
        )
        if has_no_data:
            self.logger.warning("THERE ARE NO VALUES FOR LAT {} LON {} VARIABLE {}".format(lat, lon, climate_variable))
            raise ValueError('no_data_for_lat_lon')

        # now we need to fill a PANDAS DataFrame with the lists we've been
        # collecting (a length mismatch raises ValueError, which callers treat
        # as "no data")
        df = pd.DataFrame({'days': days, climate_variable: data_values})

        # Fixing -99 invalid values from NASAPOWER: blank them out and replace
        # each gap by the mean of the closest valid neighbours on either side.
        # Gaps touching the start or the end of the year have a neighbour on
        # one side only and therefore stay NaN.
        valid_values = df[climate_variable].mask(df[climate_variable] < -98)
        df[climate_variable] = (valid_values.ffill() + valid_values.bfill()) / 2

        # adding columns with the "year" and coordinates to the df so as to
        # prepare it for export to other formats (CSV, MET, etc.)
        df.insert(0, 'year', year)
        df.insert(0, 'lat', lat)
        df.insert(0, 'lon', lon)

        return df

    def _collect_point_frames(self, lat, lon, year_range, climate_variables, frames):
        """Append one dataframe per variable/year of a lat/lon point to ``frames``.

        Returns:
            bool: False as soon as ``get_yearly_data`` reports (via
            ValueError) that the point has no data, True otherwise.
        """
        for climate_variable in climate_variables:
            self.logger.debug('Processing data for climate variable {}'.format(climate_variable))

            for year in year_range:
                self.logger.debug('Processing data for year {}'.format(year))
                self.logger.debug('Processing Variable {} - Lat {} - Lon {} for Year {}'.format(climate_variable, lat, lon, year))

                try:
                    frames.append(
                        self.get_yearly_data(lat, lon, None, year, year_range, climate_variable)
                    )
                except ValueError:
                    return False

        return True

    def generate_climate_dataframe_from_nasapower_cloud_api(self, year_range, climate_variables, lat_range, lon_range, input_dir):
        """This function generates a dataframe containing (a) climate values (b) for every variable requested (c) for every day of the year (d) for every year passed in as argument. It will leverage NASAPOWER API to do it.

        Args:
            year_range (numpy.ndarray): a numpy array with all the years for
                which we are seeking data.
            climate_variables (list): the climate variable short names as per
                SILO nomenclature. For SILO check
                https://www.longpaddock.qld.gov.au/silo/about/climate-variables/.
                Variable names are automatically translated from SILO to
                NASAPOWER codes.
            lat_range (numpy.ndarray): a numpy array of latitude values to
                extract data from
            lon_range (numpy.ndarray): a numpy array of longitude values to
                extract data from
            input_dir (str): accepted for signature compatibility; it is not
                used by this method.

        Returns:
            tuple: a tuple consisting of (a) the final dataframe containing
            values for all years, latitudes and longitudes for every climate
            variable, (b) the curated list of longitude ranges (which
            excludes all those lon values where there were no actual data
            points). The tuple is ordered as follows:
            (final_dataframe, final_lon_range)
        """
        self.logger.debug('Generating DataFrames')

        # Yearly dataframes are gathered in a list and concatenated once at
        # the end (DataFrame.append no longer exists in pandas 2.x and was
        # quadratic when used in a loop).
        collected_frames = []

        # list to keep track of lon coordinates where there are no values
        empty_lon_coordinates = []

        # Now iterating over lat and lon combinations
        for lat in tqdm(lat_range, file=sys.stdout, ascii=True, desc="Total Progress"):

            for lon in lon_range:

                if lon in empty_lon_coordinates:
                    continue

                # A ValueError inside get_yearly_data means there were no
                # values for this lat & lon combination: stop processing the
                # point and remember the longitude so it is skipped from now on.
                if not self._collect_point_frames(lat, lon, year_range, climate_variables, collected_frames):
                    self.logger.warning("This longitude value will be skipped for the rest of the climate variables and years")
                    self.logger.warning("Deleting lon {} in array position {}".format(lon, np.flatnonzero(np.asarray(lon_range) == lon)[0]))
                    empty_lon_coordinates.append(lon)

        if collected_frames:
            total_climate_df = pd.concat(collected_frames)
        else:
            total_climate_df = pd.DataFrame()

        # Remove any empty lon values from longitude array so as to avoid
        # empty MET generation
        final_lon_range = np.setdiff1d(lon_range, np.array(empty_lon_coordinates))

        # Return results in a tuple
        return (total_climate_df, final_lon_range)