GPCP Rechunk

GPCP Rechunk#

# Example recipe to demonstate reading from an existing Zarr store and
# writing a new Zarr store with a differant chunking structure


import apache_beam as beam
import zarr

from pangeo_forge_recipes.patterns import FileType, pattern_from_file_sequence
from pangeo_forge_recipes.transforms import (
    ConsolidateDimensionCoordinates,
    ConsolidateMetadata,
    OpenWithXarray,
    StoreToZarr,
)

pattern = pattern_from_file_sequence(
    ["https://ncsa.osn.xsede.org/Pangeo/pangeo-forge/gpcp-feedstock/gpcp.zarr"],
    concat_dim="time",
)


def test_ds(store: zarr.storage.FSStore) -> zarr.storage.FSStore:
    import xarray as xr

    assert xr.open_dataset(store, engine="zarr", chunks={})
    return store


recipe = (
    beam.Create(pattern.items())
    | OpenWithXarray(file_type=FileType("zarr"), xarray_open_kwargs={"chunks": {}})
    | StoreToZarr(
        store_name="gpcp_rechunked.zarr",
        target_chunks={"time": 9226, "latitude": 16, "longitude": 36, "nv": 2},
        combine_dims=pattern.combine_dim_keys,
    )
    | ConsolidateDimensionCoordinates()
    | ConsolidateMetadata()
    | "Test dataset" >> beam.Map(test_ds)
)