invalid jinja templates for a bigger number of files
See original GitHub issue

When trying to open a combined JSON file for more than 1908 netCDF files with more than 5 variables (or when setting `template_count` to 0 in `MultiZarrToZarr.translate`), fsspec raises a jinja error. Below are a script to reproduce this and the actual traceback:
script to generate, convert and combine netcdf files
import pathlib
import dask
import dask.diagnostics
import fsspec
import numpy as np
import ujson
import xarray as xr
from fsspec_reference_maker.combine import MultiZarrToZarr
from fsspec_reference_maker.hdf import SingleHdf5ToZarr
def gen_json(u, root):
    """Translate one HDF5/netCDF file into a reference JSON file.

    Parameters
    ----------
    u : str
        URL or path of the file to translate; opened through ``fsspec.open``.
    root : pathlib.Path
        Directory that receives the output, written as ``<file name>.json``.
    """
    so = {"mode": "rb"}  # add storage options for other filesystem implementations
    with fsspec.open(u, **so) as inf:
        h5chunks = SingleHdf5ToZarr(inf, u, inline_threshold=300)
        # Kept inside the outer ``with`` so that ``inf`` is still open while
        # ``translate()`` runs.
        with open(root / f"{pathlib.Path(u).name}.json", "wb") as outf:
            outf.write(ujson.dumps(h5chunks.translate()).encode())
# Directory that receives the netCDF files written by create_file().
nc_root = pathlib.Path("nc")
nc_root.mkdir(exist_ok=True)
def create_file(n):
    """Write a small netCDF file whose ``y`` coordinate is ``[n]``.

    The dataset holds two variables: ``a`` (ones, shape ``(100, 1)`` on
    dimensions ``x, y``) and ``b`` (zeros, shape ``(1, 200)`` on dimensions
    ``y, z``). It is chunked along ``x`` in blocks of 10 before being written
    to ``nc_root / f"{n:04d}.nc"``.

    Parameters
    ----------
    n : int
        Value of the single ``y`` coordinate; also used (zero-padded to four
        digits) as the file name.
    """
    ds = xr.Dataset(
        data_vars={
            "a": (("x", "y"), np.ones((100, 1))),
            "b": (("y", "z"), np.zeros((1, 200))),
        },
        coords={
            "y": [n],
        },
    ).chunk({"x": 10})
    ds.to_netcdf(nc_root / f"{n:04d}.nc")
# Generate 1909 netCDF files through dask (one more than the 1908 that still
# works), then collect their paths in sorted order.
with dask.diagnostics.ProgressBar():
    _ = dask.compute(*[dask.delayed(create_file)(i) for i in range(1909)])
paths = sorted(nc_root.iterdir())
# create a directory to store the individual reference JSON files
separate_root = pathlib.Path("separate")
separate_root.mkdir(exist_ok=True)
# translate every netCDF file into its own reference JSON file
with dask.diagnostics.ProgressBar():
    _ = dask.compute(*[dask.delayed(gen_json)(str(p), separate_root) for p in paths])
def combine_json(paths, outpath):
    """Combine per-file reference JSONs into a single reference file.

    Parameters
    ----------
    paths : sequence of path-like
        The individual reference JSON files handed to ``MultiZarrToZarr``.
    outpath : path-like
        Destination passed to ``MultiZarrToZarr.translate``.
    """
    mzz = MultiZarrToZarr(
        paths,
        remote_protocol="file",
        # Every xarray decoding step is switched off when opening the inputs.
        xarray_open_kwargs={
            "decode_cf": False,
            "mask_and_scale": False,
            "decode_times": False,
            "decode_timedelta": False,
            "use_cftime": False,
            "decode_coords": False,
        },
        xarray_concat_args={
            "combine_attrs": "override",
            "data_vars": "minimal",
            "coords": "minimal",
            "compat": "override",
            # "concat_dim": "time",
            "dim": "time",
        },
    )
    mzz.translate(outpath, template_count=1)
# Collect the per-file reference JSONs in sorted order.
json_paths = sorted(separate_root.iterdir())
# Directory for the two combined reference files.
combined_root = pathlib.Path("combined")
combined_root.mkdir(exist_ok=True)
path_1908 = combined_root / "1908.json"
path_1909 = combined_root / "1909.json"
print("combining ... ", end='')
# Build one combination from the first 1908 files and one from all files,
# so the two can be compared when opened.
combine_json(json_paths[:1908], path_1908)
combine_json(json_paths, path_1909)
print("done")
# Open the combination of the first 1908 files through the reference filesystem.
mapper_1908 = fsspec.get_mapper(
"reference://", fo=f"file://{path_1908.absolute()}", remote_protocol="file"
)
print("1908 files:", mapper_1908)
# Open the combination of all files; this is the get_mapper call that appears
# in the traceback ending in jinja2.exceptions.TemplateSyntaxError.
mapper_1909 = fsspec.get_mapper(
"reference://", fo=f"file://{path_1909.absolute()}", remote_protocol="file"
)
print("1909 files:", mapper_1909)
traceback
Traceback (most recent call last):
File ".../test.py", line 86, in <module>
mapper_1909 = fsspec.get_mapper("reference://", fo=f"file://{path_1909.absolute()}", remote_protocol="file")
File ".../fsspec/fsspec/mapping.py", line 228, in get_mapper
fs, urlpath = url_to_fs(url, **kwargs)
File ".../fsspec/fsspec/core.py", line 399, in url_to_fs
fs = cls(**options)
File ".../fsspec/fsspec/spec.py", line 76, in __call__
obj = super().__call__(*args, **kwargs)
File ".../fsspec/fsspec/implementations/reference.py", line 142, in __init__
self._process_references(text, template_overrides)
File ".../fsspec/fsspec/implementations/reference.py", line 259, in _process_references
self._process_references1(references, template_overrides=template_overrides)
File ".../fsspec/fsspec/implementations/reference.py", line 300, in _process_references1
u = _render_jinja(u)
File ".../fsspec/fsspec/implementations/reference.py", line 283, in _render_jinja
return jinja2.Template(u).render(**self.templates)
File "~/.conda/envs/fsspec/lib/python3.9/site-packages/jinja2/environment.py", line 1208, in __new__
return env.from_string(source, template_class=cls)
File "~/.conda/envs/fsspec/lib/python3.9/site-packages/jinja2/environment.py", line 1092, in from_string
return cls.from_code(self, self.compile(source), gs, None)
File "~/.conda/envs/fsspec/lib/python3.9/site-packages/jinja2/environment.py", line 757, in compile
self.handle_exception(source=source_hint)
File "~/.conda/envs/fsspec/lib/python3.9/site-packages/jinja2/environment.py", line 925, in handle_exception
raise rewrite_traceback_stack(source=source)
File "<unknown>", line 1, in template
jinja2.exceptions.TemplateSyntaxError: expected token 'end of print statement', got 'integer'
A colleague traced this to the generation of the template ids in https://github.com/intake/fsspec-reference-maker/blob/fda8a5318e307c23ef656257c0bc78262bcb9a6f/fsspec_reference_maker/combine.py#L131-L138. My conclusion is that jinja requires those ids to be valid Python identifiers, but the 1909th template gets an id of `01` (there are also single-digit ids earlier, but those are fine, apparently).
Side note: I don't understand why it uses `itertools.combinations(..., i)` and not `itertools.product(..., repeat=i)`, which would allow more values with fewer characters.
To fix this here I’d either filter out invalid identifiers:
for tup in itertools.combinations(string.ascii_letters + string.digits, i):
if tup[0].isdigit():
continue
yield "".join(tup)
or always prefix the id with a character (this results in a bigger file size, though):
for tup in itertools.combinations(string.ascii_letters + string.digits, i):
yield "i" + "".join(tup)
but I may be missing a better way. In any case, both fix my issue.
If any of these sound good we can send in a PR.
Issue Analytics
- State:
- Created 2 years ago
- Comments:5 (5 by maintainers)
Top GitHub Comments
Yes. In theory, if using jinja, you could make a more complex expression, and the spec allows for this, but none of the references we generate make use of anything except a simple replace.
I did notice that, but I didn’t know how important the templates really are, or if they influenced the open time. Are they only used to reduce the filesize?
Edit: and thanks for the tip, I will try compressing the resulting json file