HadISD Tutorial Four - Make Large Groupings

HadISD Tutorial Four - Make Large Groupings

NOTE: Before beginning this tutorial, you should first read HadISD Tutorial One - Introduction to Station Data, and ensure you complete the tutorials in order.

[14]:
from pathlib import Path
import numpy as np
from datetime import datetime
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import xarray as xr
[28]:
# A spot to put the data on disk. We keep both the data as-downloaded and the
# reprocessed version, so you might need up to 50GB free in order to make this work.

# Single root for all HadISD data, so the location only has to be changed in one place.
HADISD_DIR = Path('/g/data/kd24/data') / 'hadisd'

PROCESSING_DIR = HADISD_DIR / 'processing'   # We need to cache some data on disk during reprocessing
DECADAL_DIR = HADISD_DIR / 'by_decade'    # This will hold the final form of our data

# parents=True creates any missing parent directories; exist_ok=True keeps this
# cell re-runnable (a bare mkdir() raises FileExistsError the second time).
DECADAL_DIR.mkdir(parents=True, exist_ok=True)
[29]:
# Maps a decade label to its (start, end) pair of year strings.
decades = {'early': ('1800', '1930')}  # Just in case there is undocumented early data

# Dataset begins in 1930, start by decade here: each label is the first year of
# the decade, and the pair runs from that year to the first year of the next.
decades.update(
    (str(first_year), (str(first_year), str(first_year + 10)))
    for first_year in range(1930, 2030, 10)
)
[30]:
# Maps each decade label to the list of .nc files in PROCESSING_DIR whose
# names contain '<start>-<end>' for that decade.
files_for_decades = {}

for ix, (start_dec, end_dec) in decades.items():
    # glob yields matches in directory-listing order, which is not guaranteed to
    # be stable. Sorting makes the file lists (and any groupings sliced from
    # them) identical on every run.
    files_for_decades[ix] = sorted(PROCESSING_DIR.glob(f'*{start_dec}-{end_dec}*.nc'))

# Uncomment this to see values for debugging
# the1950s = files_for_decades['1950']
# the1950s
[31]:
decade_of_interest = '1990'  # In the interests of saving time, we process only one decade here
GROUP_SIZE = 40  # Maximum number of input files concatenated into one output file

files_for_decade = files_for_decades[decade_of_interest]

# Slice the decade's file list into consecutive chunks of at most GROUP_SIZE files.
groupings = [files_for_decade[i:i + GROUP_SIZE] for i in range(0, len(files_for_decade), GROUP_SIZE)]
print(f"{len(groupings)} file groupings to be used for decade {decade_of_interest}")

for i, grouping in enumerate(groupings):
    loaded = []
    try:
        # Append one at a time so that, if an open fails part-way through, the
        # datasets opened so far are still closed by the finally clause.
        for f in grouping:
            loaded.append(xr.open_dataset(f))
        print(f"Loaded group {i}")

        # Join every dataset in the group end-to-end along the 'report' dimension.
        combined = xr.concat(loaded, dim='report', data_vars='all')
        # Replace missing reporting_stats values with -999.0 before writing
        # (presumably the dataset's missing-value sentinel - TODO confirm).
        combined['reporting_stats'] = combined['reporting_stats'].fillna(-999.0)
        print(f"Combined group {i}")

        filename = f'all_{decade_of_interest}s_group{i}.nc'
        combined.to_netcdf(DECADAL_DIR / filename)
        print(f"Wrote group {i}")
    finally:
        # Release the underlying file handles once the group has been written
        # (or has failed); otherwise each group leaves its files open.
        for ds in loaded:
            ds.close()
1 file groupings to be used for decade 1990
Loaded group 0
Combined group 0
Wrote group 0
[32]:
# End-of-notebook marker: printed once execution reaches this final cell.
print("Completed")
Completed
[ ]: