Skip to content

Commit faf3703

Browse files
committed
Better code for zarr
1 parent ca852f0 commit faf3703

File tree

3 files changed

+56
-29
lines changed

3 files changed

+56
-29
lines changed

src/py_eddy_tracker/observations/observation.py

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -397,9 +397,17 @@ def zarr_dimension(filename):
397397
return set(dims)
398398

399399
@classmethod
400-
def load_from_zarr(cls, filename, remove_vars=None, include_vars=None):
400+
def load_file(cls, filename, **kwargs):
401+
if filename.endswith('.zarr'):
402+
return cls.load_from_zarr(filename, **kwargs)
403+
else:
404+
return cls.load_from_netcdf(filename, **kwargs)
405+
406+
@classmethod
407+
def load_from_zarr(cls, filename, raw_data=False, remove_vars=None, include_vars=None):
401408
# FIXME: must be investigated; in zarr there are no dimension names (or they could be added in attrs)
402409
array_dim = 50
410+
BLOC = 5000000
403411
if not isinstance(filename, str):
404412
filename = filename.astype(str)
405413
h_zarr = zarr.open(filename)
@@ -428,13 +436,14 @@ def load_from_zarr(cls, filename, remove_vars=None, include_vars=None):
428436
continue
429437
if var_inv not in cls.ELEMENTS and var_inv not in array_variables:
430438
kwargs["track_extra_variables"].append(var_inv)
431-
kwargs["raw_data"] = False
439+
kwargs["raw_data"] = raw_data
432440
kwargs["only_variables"] = None if include_vars is None else [VAR_DESCR_inv[i] for i in include_vars]
433441
eddies = cls(size=nb_obs, **kwargs)
434442
for variable in var_list:
435443
var_inv = VAR_DESCR_inv[variable]
436444
if var_inv == "type_cyc":
437445
continue
446+
logging.debug('%s will be loaded', variable)
438447
# find unit factor
439448
factor = 1
440449
input_unit = h_zarr[variable].attrs.get('unit', None)
@@ -456,15 +465,22 @@ def load_from_zarr(cls, filename, remove_vars=None, include_vars=None):
456465
if factor != 1:
457466
logging.info('%s will be multiply by %f to take care of units(%s->%s)',
458467
variable, factor, input_unit, output_unit)
459-
if factor != 1:
460-
eddies.obs[var_inv] = h_zarr[variable][:] * factor
461-
else:
462-
eddies.obs[var_inv] = h_zarr[variable][:]
468+
nb = h_zarr[variable].shape[0]
469+
470+
scale_factor = VAR_DESCR[var_inv].get('scale_factor', None)
471+
add_offset = VAR_DESCR[var_inv].get('add_offset', None)
472+
for i in range(0, nb, BLOC):
473+
sl = slice(i, i + BLOC)
474+
data = h_zarr[variable][sl]
475+
if factor != 1:
476+
data *= factor
477+
if raw_data:
478+
if add_offset is not None:
479+
data -= add_offset
480+
if scale_factor is not None:
481+
data /= scale_factor
482+
eddies.obs[var_inv][sl] = data
463483

464-
# for variable in var_list:
465-
# var_inv = VAR_DESCR_inv[variable]
466-
# if var_inv == "type_cyc":
467-
# eddies.sign_type = h_zarr[variable][0]
468484
eddies.sign_type = h_zarr.attrs.get("rotation_type", 0)
469485
if eddies.sign_type == 0:
470486
logging.debug("File come from another algorithm of identification")

src/scripts/EddySubSetter

Lines changed: 29 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -12,23 +12,29 @@ def id_parser():
1212
parser = EddyParser('Eddy Identification')
1313
parser.add_argument('filename')
1414
parser.add_argument('filename_out')
15-
parser.add_argument('-p', '--period', nargs=2, type=int,
16-
help='Start day and end day, if it s negative value we will add to day min and add to day max, if 0 it s not use')
17-
parser.add_argument('-l', '--length', nargs=2, type=int,
18-
help='Minimal and maximal quantity of observation for one track, ones bounds could be negative, it will be not use')
19-
parser.add_argument('-f', '--full_path', action='store_true',
20-
help='Extract path, if one obs or more are selected')
21-
parser.add_argument('-d', '--remove_incomplete', action='store_true',
22-
help='Extract path only if all obs are selected')
23-
parser.add_argument('--reject_virtual', action='store_true',
24-
help="If there are only virtual observation in selection, we don't select track")
25-
parser.add_argument('-a', '--area', nargs=4, type=float,
26-
metavar=('llcrnrlon', 'llcrnrlat', 'urcrnrlon', 'urcrnrlat'),
27-
help='Coordinates of bounding to extract'
28-
)
29-
parser.add_argument('--remove_var', nargs='+', type=str, help='remove all listed variable')
30-
parser.add_argument('--include_var', nargs='+', type=str, help='use only listed variable, remove_var will be ignored')
31-
parser.add_argument('-i', '--ids', nargs='+', type=int, help='List of tracks which will be extract')
15+
16+
group = parser.add_argument_group('Extraction options')
17+
group.add_argument('-p', '--period', nargs=2, type=int,
18+
help='Start day and end day, if it s negative value we will add to day min and add to day max, if 0 it s not use')
19+
group.add_argument('-l', '--length', nargs=2, type=int,
20+
help='Minimal and maximal quantity of observation for one track, ones bounds could be negative, it will be not use')
21+
group.add_argument('-f', '--full_path', action='store_true',
22+
help='Extract path, if one obs or more are selected')
23+
group.add_argument('-d', '--remove_incomplete', action='store_true',
24+
help='Extract path only if all obs are selected')
25+
group.add_argument('--reject_virtual', action='store_true',
26+
help="If there are only virtual observation in selection, we don't select track")
27+
group.add_argument('-a', '--area', nargs=4, type=float,
28+
metavar=('llcrnrlon', 'llcrnrlat', 'urcrnrlon', 'urcrnrlat'),
29+
help='Coordinates of bounding to extract'
30+
)
31+
group.add_argument('--remove_var', nargs='+', type=str, help='remove all listed variable')
32+
group.add_argument('--include_var', nargs='+', type=str, help='use only listed variable, remove_var will be ignored')
33+
group.add_argument('-i', '--ids', nargs='+', type=int, help='List of tracks which will be extract')
34+
35+
group = parser.add_argument_group('Extraction options')
36+
group.add_argument('--sort_time', action='store_true', help='sort all observation with time')
37+
3238
parser.add_argument('-n', '--no_raw_mode', action='store_true',
3339
help='Uncompress all data, could be create a memory error for huge file, but is safer for extern file of py eddy tracker')
3440
return parser
@@ -38,7 +44,7 @@ if __name__ == '__main__':
3844
args = id_parser().parse_args()
3945

4046
# Original dataset
41-
dataset = TrackEddiesObservations.load_from_netcdf(
47+
dataset = TrackEddiesObservations.load_file(
4248
args.filename,
4349
raw_data=False if args.no_raw_mode else True,
4450
remove_vars=args.remove_var,
@@ -70,6 +76,11 @@ if __name__ == '__main__':
7076
remove_incomplete=args.remove_incomplete,
7177
reject_virtual=args.reject_virtual)
7278

79+
if args.sort_time:
80+
logging.debug('start sorting ...')
81+
dataset.obs.sort(order=['time', 'lon', 'lat'])
82+
logging.debug('end sorting')
83+
7384
# if no data, no output will be written
7485
if len(dataset) == 0:
7586
logging.warning("No data are selected, out file couldn't be create")

src/scripts/EddyTracking

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ if __name__ == '__main__':
184184
NB_OBS_MIN = int(CONFIG.get('TRACK_DURATION_MIN', 14))
185185
CORRESPONDANCES.prepare_merging()
186186

187-
logging.info('The longest tracks have %d observations', CORRESPONDANCES.nb_obs_by_tracks.max())
187+
logging.info('Longer track saved have %d obs', CORRESPONDANCES.nb_obs_by_tracks.max())
188188
logging.info('The mean length is %d observations before filtering', CORRESPONDANCES.nb_obs_by_tracks.mean())
189189

190190
CORRESPONDANCES.get_unused_data().write_file(path=SAVE_DIR, filename='%(path)s/%(sign_type)s_untracked.nc', zarr_flag=ZARR)

0 commit comments

Comments
 (0)