#!/usr/bin/python3
"""
Copyright (C) 2023 NIBIO <https://www.nibio.no/>.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
# Requirements:
# * CDO >= v 2.0
# * GDAL >= v 3.4.3 built with Python support
# * For Python: See requirements.txt
import os, sys, subprocess, glob
from dotenv import load_dotenv
from datetime import datetime, timezone, timedelta
from jinja2 import Environment, FileSystemLoader
import pytz
import netCDF4 as nc
import configparser
import logging
# Paths config
# Create a .env file from dotenv-sample
load_dotenv()
# Get language stuff
config = configparser.ConfigParser()
config.read("SEPTREFHUM.cfg")
# Paths to weather data
model_id = os.getenv("MODEL_ID")
infile_path = os.getenv("WEATHER_DATA_DIR")
infile_archive_path = os.getenv("ARCHIVE_WEATHER_DATA_DIR")
# Used for iterating the weather data files
filename_pattern = os.getenv("FILENAME_PATTERN")
# Date format of weather data filenames
filename_dateformat = os.getenv("FILENAME_DATEFORMAT")
# Path to store generated GeoTIFF files
outfile_path = os.getenv("DATA_DIR")
# Names of weather parameters in NetCDF files
RR = os.getenv("RR")
UM = os.getenv("UM")
TM = os.getenv("TM")
local_timezone = pytz.timezone(os.getenv("LOCAL_TIMEZONE"))
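# Illustrative (hypothetical) .env values, shown only to document the expected shape of the
# configuration; the real values are set in the .env file created from dotenv-sample:
#   MODEL_ID=SEPTREFHUM
#   WEATHER_DATA_DIR=/path/to/weatherdata/
#   ARCHIVE_WEATHER_DATA_DIR=/path/to/weatherdata_archive/
#   FILENAME_PATTERN=met_*.nc
#   FILENAME_DATEFORMAT=met_%Y-%m-%d.nc
#   DATA_DIR=/path/to/output/
#   RR=RR
#   UM=UM
#   TM=TM
#   LOCAL_TIMEZONE=Europe/Oslo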
today = datetime.now(local_timezone)
if len(sys.argv) > 1:
    year = int(sys.argv[1])
else:
    year = today.year
# Where to store intermediary calculations
tmpfile_path = f"tmp/{year}/"
TEMPERATURE_THRESHOLD = 8.0
DEBUG = os.getenv("DEBUG", "false").lower() != "false"
logging.getLogger().handlers.clear()
logging.basicConfig(
    level=logging.DEBUG if DEBUG else logging.INFO,
    format="%(asctime)s - %(levelname).4s - (%(filename)s:%(lineno)d) - %(message)s",
)
# Iterate the set of hourly weather data files
# 1. Find the latest wh_[DATE].nc file and set the earliest weather data file to process: start_date = [DATE] - 2 days
# --> If no wh_DATE.nc file exists, set start_date = None
start_date = None
last_wh_date = None
#"""
for wh_file in glob.glob(f"{tmpfile_path}wh_2[0-9][0-9][0-9]-[01][0-9]-[0123][0-9].nc"):
    # The date sits at a fixed position in the path: tmp/<year>/wh_YYYY-MM-DD.nc
    current_wh_file_date = local_timezone.localize(datetime.strptime(wh_file[12:22], "%Y-%m-%d"))
    if last_wh_date is None or last_wh_date < current_wh_file_date:
        last_wh_date = current_wh_file_date
if last_wh_date is not None:
    start_date = last_wh_date - timedelta(days=2)
    logging.info(f"Last date of WH calculations is {last_wh_date}. Start date = {start_date}")
weatherdata_files = glob.glob(f"{infile_path}{filename_pattern}")
if DEBUG:
    logging.info(f"{infile_path}{filename_pattern}")
    logging.info("What are the weatherdata files?")
    logging.info(weatherdata_files)
for file_path in sorted(weatherdata_files):
    # TODO: When filename/pattern is configurable: make the string search adaptable
    file_name = os.path.basename(file_path)
    # Skip the file if its name does not contain a complete date, which could indicate
    # that we are looking at a yearly file of daily data
    try:
        wh_sum_date = local_timezone.localize(datetime.strptime(file_name, filename_dateformat))
    except ValueError as e:
        logging.info(e)
        continue
    # Only process files from the last three days (unless we are calculating from scratch)
    if start_date is not None and wh_sum_date < start_date:
        continue
    # Check that the file has at least 23 timesteps
    with nc.Dataset(file_path, 'r') as weatherdata_file:
        file_timesteps = len(weatherdata_file.variables["time"])
        if file_timesteps < 23:
            logging.info(f"{file_path} has {file_timesteps} timesteps. Skipping it.")
            continue
    # Produce daily files with WH_DAYSUM, the number of "wet hours" (WH) for a given day.
    # An hour counts as wet when TM >= TEMPERATURE_THRESHOLD and (RR > 0.2 or UM > 88.0)
    wh_sum_date_str = wh_sum_date.strftime("%Y-%m-%d")
    wh_sum_hour_str = wh_sum_date.strftime("%H")
    subprocess.run(
        f'cdo -s -O -setdate,{wh_sum_date_str} -settime,{wh_sum_hour_str}:00:00 -chname,WH,WH_DAYSUM -timsum -selname,WH -aexpr,"WH = ({TM} >= {TEMPERATURE_THRESHOLD} && ({RR} > 0.2 || {UM} > 88.0)) ? 1 : 0;" {file_path} {tmpfile_path}wh_{wh_sum_date_str}.nc',
        shell=True
    )
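# The chained cdo operators above are applied right to left: aexpr flags each hour as wet (1)
# or dry (0), selname keeps only the WH variable, timsum sums the flags over the file's
# timesteps, chname renames the result to WH_DAYSUM, and setdate/settime stamp the single
# output timestep with the file's date.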
# Concatenate daily files > one file with daily values
# Additive calculation - this file can be stored between calculations
subprocess.run(f'cdo -s -O mergetime {tmpfile_path}wh_2[0-9][0-9][0-9]-[01][0-9]-[0123][0-9].nc {tmpfile_path}wh_daysum.nc', shell=True)
# Check timesteps - that we have no missing data
wh_daysum = nc.Dataset(f'{tmpfile_path}wh_daysum.nc', 'r')
timesteps = wh_daysum.variables["time"][:]
previous_timestep = None
for timestep in timesteps:
    if previous_timestep is not None:
        if timestep - previous_timestep != 86400.0:
            timestep_str = datetime.fromtimestamp(timestep).astimezone(local_timezone).strftime("%Y-%m-%d")
            previous_timestep_str = datetime.fromtimestamp(previous_timestep).astimezone(local_timezone).strftime("%Y-%m-%d")
            logging.error(f"Missing weather data between {previous_timestep_str} and {timestep_str}. Exiting.")
            sys.exit(1)
    previous_timestep = timestep
wh_daysum.close()
# From here, no additive calculation; we calculate from the beginning of the wh_daysum.nc file every time.
# Add the sum of WH_DAYSUM[yesterday] + WH_DAYSUM[today] + WH_DAYSUM[tomorrow] into WHS[today]
# timselsum sums non-overlapping windows of 3 timesteps, so we must
# create three files with different offsets and then merge them
subprocess.run(f'cdo -s timselsum,3,0 {tmpfile_path}wh_daysum.nc {tmpfile_path}wh_3daysum_tmp_0.nc', shell=True)
subprocess.run(f'cdo -s timselsum,3,1 {tmpfile_path}wh_daysum.nc {tmpfile_path}wh_3daysum_tmp_1.nc', shell=True)
subprocess.run(f'cdo -s timselsum,3,2 {tmpfile_path}wh_daysum.nc {tmpfile_path}wh_3daysum_tmp_2.nc', shell=True)
subprocess.run(f'cdo -s -chname,WH_DAYSUM,WHS -mergetime {tmpfile_path}wh_3daysum_tmp_*.nc {tmpfile_path}wh_3daysum_tmp_merged.nc', shell=True)
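# With timselsum,3,<offset>, offset 0 sums days (1,2,3), (4,5,6), ..., offset 1 sums
# (2,3,4), (5,6,7), ..., and offset 2 sums (3,4,5), (6,7,8), ... Merging the three files
# by time therefore yields the rolling 3-day sum described above for every day.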
# The last timesteps are most likely wrong, because the "tomorrows" they need are missing
# when timselsum is performed. To remove them:
#
# The variable time_bnds (dimensions: time and bnds, where bnds has size 2) contains the
# timestamps (in seconds) of the first and last timesteps used for summing up, e.g. like this:
# $ ncdump -v time_bnds tmp/wh_3daysum.nc
#  time_bnds =
#   1696111200, 1696284000,
#   1696197600, 1696370400,
#   1696284000, 1696456800,
#   1696370400, 1696543200,
#   1696456800, 1696629600,
#   1696543200, 1696716000,
#   1696629600, 1696716000,
#   1696716000, 1696716000;
# }
# The difference [1] - [0] should be 172800 seconds = 48 hours
# Timesteps with [1] - [0] != 172800 should be discarded
# Using netCDF4 to accomplish this
wh_3daysum = nc.Dataset(f'{tmpfile_path}wh_3daysum_tmp_merged.nc', 'r')
time_bnds = wh_3daysum.variables["time_bnds"][:]
# Assuming that wrong time bounds only exist at the end of the time series, this works
number_of_timesteps_to_remove = 0
for time_bnd in time_bnds:
    if time_bnd[1] - time_bnd[0] != 172800.0:
        number_of_timesteps_to_remove = number_of_timesteps_to_remove + 1
wh_3daysum.close()
number_of_timesteps_to_keep = len(time_bnds) - number_of_timesteps_to_remove
subprocess.run(f'cdo -s -seltimestep,1/{number_of_timesteps_to_keep} {tmpfile_path}wh_3daysum_tmp_merged.nc {tmpfile_path}wh_3daysum.nc', shell=True)
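# seltimestep,1/N above keeps only timesteps 1 through N, dropping the trailing timesteps
# whose time bounds do not span the full 48 hours.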
# Classifying warning status for the WHS model
# 0 =< WHS < 20 --> Green
# 20 <= WHS < 40 --> Yellow
# 40 <= WHS --> Red
subprocess.run(f'cdo -s -aexpr,"WARNING_STATUS = WHS < 0 ? 0 : -1; WARNING_STATUS = WHS < 20 && WARNING_STATUS == -1 ? 2 : WARNING_STATUS; WARNING_STATUS = WHS < 40 && WARNING_STATUS == -1 ? 3 : WARNING_STATUS; WARNING_STATUS = WHS >= 40 && WARNING_STATUS == -1 ? 4 : WARNING_STATUS" {tmpfile_path}wh_3daysum.nc {tmpfile_path}result_unmasked.nc', shell=True)
subprocess.run(f"rm {tmpfile_path}result.nc", shell=True)
# Mask results using a CSV file with polygons
# Env variable MASK_FILE must be set
if os.getenv("MASK_FILE") is not None:
mask_file = os.getenv("MASK_FILE")
logging.info(f"Applying mask file {mask_file} to result.nc")
subprocess.run(f'cdo -P 6 -maskregion,{mask_file} {tmpfile_path}result_unmasked.nc {tmpfile_path}result.nc', shell=True)
else:
os.rename(f"{tmpfile_path}result_unmasked.nc", f"{tmpfile_path}result.nc")
#"""
# Split the combined file into daily .nc files again, with YYYY-MM-DD in the filename. Convert to corresponding GeoTIFF files
# Variables that need discrete classification must be integers in order for mapserver to work properly (Don't ask!)
# Since WARNING_STATUS needs to be discretely classified, we create a separate GeoTIFF file for it
wh_3daysum = nc.Dataset(f'{tmpfile_path}wh_3daysum.nc', 'r')
timesteps = wh_3daysum.variables["time"][:]
timestep_index = 1
timestep_dates = [] # Used in the mapfile template
# TODO: We are (?) assuming all timesteps are equal. This might NOT be the case if there are holes in the result data
for timestep in timesteps:
    timestep_date = datetime.fromtimestamp(timestep).astimezone(local_timezone)
    file_date = timestep_date.strftime("%Y-%m-%d")
    timestep_dates.append(file_date)
    # Create NetCDF result file
    subprocess.run(f'cdo -s -seltimestep,{timestep_index}/{timestep_index} {tmpfile_path}result.nc {tmpfile_path}result_{file_date}.nc', shell=True)
    # Convert to GeoTIFF
    # We only need WHS and WARNING_STATUS
    # WARNING_STATUS needs discrete classification, so it is written to its own single-band
    # GeoTIFF, separate from the WHS file
    # We must delete any existing GeoTIFF files for this date before regenerating them
    subprocess.run(f'rm {outfile_path}result_*{file_date}*.tif', shell=True)
with open("/dev/null", "w") as devnull:
subprocess.run(f'gdal_translate -ot Int16 -of GTiff NETCDF:"{tmpfile_path}result_{file_date}.nc":WARNING_STATUS {tmpfile_path}result_WARNING_STATUS_{file_date}_lcc.tif', shell=True, stdout=devnull)
subprocess.run(f'gdal_translate -ot Float32 -of GTiff NETCDF:"{tmpfile_path}result_{file_date}.nc":WHS {tmpfile_path}result_{file_date}_lcc.tif', shell=True, stdout=devnull)
# Need to reproject the files, to ensure we have the projection given in the generted mapfile. We always use EPSG:4326 for this
subprocess.run(f'gdalwarp -t_srs EPSG:4326 {tmpfile_path}result_WARNING_STATUS_{file_date}_lcc.tif {outfile_path}result_WARNING_STATUS_{file_date}.tif', shell=True, stdout=devnull)
subprocess.run(f'gdalwarp -t_srs EPSG:4326 {tmpfile_path}result_{file_date}_lcc.tif {outfile_path}result_{file_date}.tif', shell=True, stdout=devnull)
timestep_index = timestep_index + 1
# Generate mapfile
# Building data sets for language specific legends
languages = []
language_codes = config["i18n"]["languages"].split(",")
for language_code in language_codes:
    language = {"language_code": language_code}
    if ("i18n.%s" % language_code) in config:
        for keyword in config["i18n.%s" % language_code]:
            language[keyword] = config["i18n.%s" % language_code][keyword]
    languages.append(language)
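# Hypothetical sketch of the [i18n] sections the loop above expects in SEPTREFHUM.cfg
# (the actual languages and legend keys are defined in the real config file):
#   [i18n]
#   languages = en,nb
#   [i18n.en]
#   whs = Wet hours sum
#   [i18n.nb]
#   whs = Sum fuktige timer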
# The paths should be set in a .env file
env = Environment(loader=FileSystemLoader('.'))
template = env.get_template("mapfile/template.j2")
output = template.render({
    "model_id": model_id,
    "timestep_dates": timestep_dates,
    "mapserver_data_dir": os.getenv("MAPSERVER_DATA_DIR"),
    "mapserver_mapfile_dir": os.getenv("MAPSERVER_MAPFILE_DIR"),
    "mapserver_log_file": os.getenv("MAPSERVER_LOG_FILE"),
    "mapserver_image_path": os.getenv("MAPSERVER_IMAGE_PATH"),
    "mapserver_extent": os.getenv("MAPSERVER_EXTENT"),
    "languages": languages,
    "language_codes": language_codes
})
mapfile_outdir = os.getenv("MAPFILE_DIR")
with open(f"{mapfile_outdir}/{model_id}.map", 'w') as f:
    f.write(output)
# Remove all temporary/intermediary files
subprocess.run(f"rm {tmpfile_path}wh_3daysum*.nc", shell=True)
subprocess.run(f"rm {tmpfile_path}result_*.nc", shell=True)
subprocess.run(f"rm {tmpfile_path}result*.tif", shell=True)