HadISD Tutorial Four - Make Large Groupings
NOTE: Before beginning this tutorial, you should first read HadISD Tutorial One - Introduction to Station Data, and ensure you complete the tutorials in order.
[14]:
from pathlib import Path
import numpy as np
from datetime import datetime
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import xarray as xr
[28]:
# A spot to put the data on disk. We keep both the data as-downloaded and the
# reprocessed version, so you might need up to 50GB free in order to make this work.
PROCESSING_DIR = Path('/g/data/kd24/data') / 'hadisd' / 'processing'  # We need to cache some data on disk during reprocessing
DECADAL_DIR = Path('/g/data/kd24/data') / 'hadisd' / 'by_decade'  # This will hold the final form of our data

# exist_ok=True keeps this cell re-runnable (a bare mkdir() raises FileExistsError
# the second time the notebook is run); parents=True creates any missing parent
# directories instead of raising FileNotFoundError.
DECADAL_DIR.mkdir(parents=True, exist_ok=True)
[29]:
# Maps a decade label to its (start_year, end_year) pair, both as strings.
# 'early' is a catch-all, just in case there is undocumented early data.
# The dataset begins in 1930, so the regular ten-year bins start there and
# run through the 2020-2030 bin.
decades = {
    'early': ('1800', '1930'),
    **{
        str(year): (str(year), str(year + 10))
        for year in range(1930, 2030, 10)
    },
}
[30]:
# For every decade label, collect the .nc files in PROCESSING_DIR whose
# names contain "<start>-<end>" for that decade.
files_for_decades = {}
for label, (first_year, last_year) in decades.items():
    pattern = f'*{first_year}-{last_year}*.nc'
    files_for_decades[label] = list(PROCESSING_DIR.glob(pattern))

# Uncomment the lines below to inspect the values for one decade when debugging
# the1950s = files_for_decades['1950']
# the1950s
[31]:
# Combine the files found for one decade into a small number of large netCDF
# files, one output file per group of up to GROUP_SIZE input files.
decade_of_interest = '1990'  # In the interests of saving time, we process only one decade here
GROUP_SIZE = 40  # Maximum number of input files concatenated into one output file

files_for_decade = files_for_decades[decade_of_interest]
groupings = [
    files_for_decade[i:i + GROUP_SIZE]
    for i in range(0, len(files_for_decade), GROUP_SIZE)
]
print(f"{len(groupings)} file groupings to be used for decade {decade_of_interest}")

for i, grouping in enumerate(groupings):
    loaded = []
    try:
        # Open one file at a time so that anything already opened is still
        # closed by the finally-clause if a later open fails.
        for f in grouping:
            loaded.append(xr.open_dataset(f))
        print(f"Loaded group {i}")

        # Join every data variable of the group along the 'report' dimension.
        combined = xr.concat(loaded, dim='report', data_vars='all')
        # Fill missing values in reporting_stats with the sentinel -999.0.
        combined['reporting_stats'] = combined['reporting_stats'].fillna(-999.0)
        print(f"Combined group {i}")

        filename = f'all_{decade_of_interest}s_group{i}.nc'
        combined.to_netcdf(DECADAL_DIR / filename)
        print(f"Wrote group {i}")
    finally:
        # Release the underlying file handles; without this every group
        # leaves all of its source files open for the life of the kernel.
        for ds in loaded:
            ds.close()
1 file groupings to be used for decade 1990
Loaded group 0
Combined group 0
Wrote group 0
[32]:
# Signal that every cell above has finished running.
print('Completed')
Completed
[ ]: