use std::path::PathBuf;

-use anyhow::Result;
+use anyhow::{Result, anyhow};
use datafusion::arrow::record_batch::RecordBatch;
use heck::ToSnakeCase;
use itertools::Itertools;
@@ -26,10 +26,8 @@ use nautilus_model::data::{
};
use nautilus_serialization::{
    arrow::{DecodeDataFromRecordBatch, EncodeToRecordBatch},
-    parquet::{
-        ParquetWriteMode, combine_data_files, min_max_from_parquet_metadata,
-        write_batches_to_parquet,
-    },
+    enums::ParquetWriteMode,
+    parquet::{combine_data_files, min_max_from_parquet_metadata, write_batches_to_parquet},
};
use serde::Serialize;
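
Note: `anyhow!` joins the imports because `make_path` below now constructs an ad-hoc error instead of panicking. A minimal standalone sketch of the pattern (not part of this diff):

```rust
use anyhow::{Result, anyhow};

// `anyhow!` builds an ad-hoc `anyhow::Error` from a format string, the
// same pattern `make_path` uses below to reject ambiguous write modes.
fn reject_ambiguous_mode(n_files: usize) -> Result<()> {
    if n_files > 1 {
        return Err(anyhow!("directory already contains {n_files} parquet files"));
    }
    Ok(())
}
```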
@@ -87,38 +85,34 @@ impl ParquetDataCatalog {
        let _ = self.write_to_parquet(bar, None, None, None, write_mode);
    }

-    #[must_use]
    pub fn write_to_parquet<T>(
        &self,
        data: Vec<T>,
        path: Option<PathBuf>,
        compression: Option<parquet::basic::Compression>,
        max_row_group_size: Option<usize>,
        write_mode: Option<ParquetWriteMode>,
-    ) -> PathBuf
+    ) -> Result<PathBuf>
    where
        T: GetTsInit + EncodeToRecordBatch + CatalogPathPrefix,
    {
        let type_name = std::any::type_name::<T>().to_snake_case();
        Self::check_ascending_timestamps(&data, &type_name);
-
-        let batches = self.data_to_record_batches(data);
-        let batch = batches.first().expect("Expected at least one batch");
-        let schema = batch.schema();
-        let instrument_id = schema.metadata.get("instrument_id");
-        let path =
-            path.unwrap_or_else(|| self.make_path(T::path_prefix(), instrument_id, write_mode));
+        let batches = self.data_to_record_batches(data)?;
+        let schema = batches.first().expect("Batches are empty.").schema();
+        let instrument_id = schema.metadata.get("instrument_id").cloned();
+        let new_path = self.make_path(T::path_prefix(), instrument_id, write_mode)?;
+        let path = path.unwrap_or(new_path);

        // Write all batches to parquet file
        info!(
            "Writing {} batches of {type_name} data to {path:?}",
            batches.len()
        );

-        write_batches_to_parquet(&batches, &path, compression, max_row_group_size, write_mode)
-            .unwrap_or_else(|_| panic!("Failed to write {type_name} to parquet"));
+        write_batches_to_parquet(&batches, &path, compression, max_row_group_size, write_mode)?;

-        path
+        Ok(path)
    }

    fn check_ascending_timestamps<T: GetTsInit>(data: &[T], type_name: &str) {
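
With the `#[must_use]` attribute and panicking paths gone, `write_to_parquet` is fallible and the caller decides whether to propagate. A hypothetical call site (helper name and data type assumed, not from this PR):

```rust
use std::path::PathBuf;

use anyhow::Result;
use nautilus_model::data::quote::QuoteTick;

// Errors from encoding, directory creation, and the parquet writer now
// surface to the caller via `?` instead of aborting the process.
fn persist_quotes(catalog: &ParquetDataCatalog, quotes: Vec<QuoteTick>) -> Result<PathBuf> {
    catalog.write_to_parquet(quotes, None, None, None, None)
}
```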
@@ -128,71 +122,77 @@ impl ParquetDataCatalog {
        );
    }

-    #[must_use]
-    pub fn data_to_record_batches<T>(&self, data: Vec<T>) -> Vec<RecordBatch>
+    pub fn data_to_record_batches<T>(&self, data: Vec<T>) -> Result<Vec<RecordBatch>>
    where
        T: GetTsInit + EncodeToRecordBatch,
    {
-        data.into_iter()
-            .chunks(self.batch_size)
-            .into_iter()
-            .map(|chunk| {
-                // Take first element and extract metadata
-                // SAFETY: Unwrap safe as already checked that `data` not empty
-                let data = chunk.collect_vec();
-                let metadata = EncodeToRecordBatch::chunk_metadata(&data);
-                T::encode_batch(&metadata, &data).expect("Expected to encode batch")
-            })
-            .collect()
+        let mut batches = Vec::new();
+
+        for chunk in &data.into_iter().chunks(self.batch_size) {
+            let data = chunk.collect_vec();
+            let metadata = EncodeToRecordBatch::chunk_metadata(&data);
+            let record_batch = T::encode_batch(&metadata, &data)?;
+            batches.push(record_batch);
+        }
+
+        Ok(batches)
    }

    fn make_path(
        &self,
        type_name: &str,
-        instrument_id: Option<&String>,
+        instrument_id: Option<String>,
        write_mode: Option<ParquetWriteMode>,
-    ) -> PathBuf {
+    ) -> Result<PathBuf> {
+        let path = self.make_directory_path(type_name, instrument_id);
+        std::fs::create_dir_all(&path)?;
        let used_write_mode = write_mode.unwrap_or(ParquetWriteMode::Overwrite);
-        let mut path = self.base_path.join("data").join(type_name);
+        let mut file_path = path.join("data-0.parquet");
+        let mut empty_path = file_path.clone();
+        let mut i = 0;

-        if let Some(id) = instrument_id {
-            path = path.join(id);
+        while empty_path.exists() {
+            i += 1;
+            let name = format!("data-{i}.parquet");
+            empty_path = path.join(name);
        }

-        std::fs::create_dir_all(&path).expect("Failed to create directory");
-        let mut file_path = path.join("data-0.parquet");
+        if i > 1 && used_write_mode != ParquetWriteMode::NewFile {
+            return Err(anyhow!(
+                "Only ParquetWriteMode::NewFile is allowed for a directory containing several parquet files."
+            ));
+        } else if used_write_mode == ParquetWriteMode::NewFile {
+            file_path = empty_path;
+        }
+
+        info!("Created directory path: {file_path:?}");

-        if used_write_mode == ParquetWriteMode::NewFile {
-            let mut i = 0;
+        Ok(file_path)
+    }

-            while file_path.exists() {
-                i += 1;
-                let name = format!("data-{i}.parquet");
-                file_path = path.join(name);
-            }
+    fn make_directory_path(&self, type_name: &str, instrument_id: Option<String>) -> PathBuf {
+        let mut path = self.base_path.join("data").join(type_name);
+
+        if let Some(id) = instrument_id {
+            path = path.join(id.replace('/', "")); // for FX symbols like EUR/USD
        }

-        info!("Created directory path: {file_path:?}");
-        file_path
+        path
    }

-    #[must_use]
    pub fn write_to_json<T>(
        &self,
        data: Vec<T>,
        path: Option<PathBuf>,
        write_metadata: bool,
-    ) -> PathBuf
+    ) -> Result<PathBuf>
    where
        T: GetTsInit + Serialize + CatalogPathPrefix + EncodeToRecordBatch,
    {
        let type_name = std::any::type_name::<T>().to_snake_case();
        Self::check_ascending_timestamps(&data, &type_name);
-
-        let json_path = path.unwrap_or_else(|| {
-            let path = self.make_path(T::path_prefix(), None, None);
-            path.with_extension("json")
-        });
+        let new_path = self.make_path(T::path_prefix(), None, None)?;
+        let json_path = path.unwrap_or(new_path.with_extension("json"));

        info!(
            "Writing {} records of {type_name} data to {json_path:?}",
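
The iterator chain in `data_to_record_batches` became a plain loop because `?` cannot propagate out of a `map` closure. An iterator-based equivalent would have to collect into a `Result` instead — a sketch, assuming the module's imports (`Itertools`, `RecordBatch`, the serialization traits) and that `encode_batch`'s error type converts into `anyhow::Error`:

```rust
fn encode_chunks<T>(data: Vec<T>, batch_size: usize) -> anyhow::Result<Vec<RecordBatch>>
where
    T: GetTsInit + EncodeToRecordBatch,
{
    data.into_iter()
        .chunks(batch_size)
        .into_iter()
        .map(|chunk| {
            let data = chunk.collect_vec();
            let metadata = EncodeToRecordBatch::chunk_metadata(&data);
            // The closure itself returns a `Result`, so `?` is legal here;
            // collecting into `Result<Vec<_>>` stops at the first error.
            Ok(T::encode_batch(&metadata, &data)?)
        })
        .collect()
}
```

Both shapes short-circuit on the first encoding failure; the explicit loop in the diff simply reads more plainly.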
@@ -203,22 +203,17 @@ impl ParquetDataCatalog {
            let metadata = T::chunk_metadata(&data);
            let metadata_path = json_path.with_extension("metadata.json");
            info!("Writing metadata to {metadata_path:?}");
-            let metadata_file = std::fs::File::create(&metadata_path)
-                .unwrap_or_else(|_| panic!("Failed to create metadata file at {metadata_path:?}"));
-            serde_json::to_writer_pretty(metadata_file, &metadata)
-                .unwrap_or_else(|_| panic!("Failed to write metadata to JSON"));
+            let metadata_file = std::fs::File::create(&metadata_path)?;
+            serde_json::to_writer_pretty(metadata_file, &metadata)?;
        }

-        let file = std::fs::File::create(&json_path)
-            .unwrap_or_else(|_| panic!("Failed to create JSON file at {json_path:?}"));
-
-        serde_json::to_writer_pretty(file, &serde_json::to_value(data).unwrap())
-            .unwrap_or_else(|_| panic!("Failed to write {type_name} to JSON"));
+        let file = std::fs::File::create(&json_path)?;
+        serde_json::to_writer_pretty(file, &serde_json::to_value(data)?)?;

-        json_path
+        Ok(json_path)
    }

-    pub fn consolidate_data(&self, type_name: &str, instrument_id: Option<&String>) -> Result<()> {
+    pub fn consolidate_data(&self, type_name: &str, instrument_id: Option<String>) -> Result<()> {
        let parquet_files = self.query_parquet_files(type_name, instrument_id)?;

        if !parquet_files.is_empty() {
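
Throughout the file, `Option<&String>` parameters become owned `Option<String>`, which composes directly with the `schema.metadata.get(..).cloned()` call above. A hypothetical call site (the `type_name` value is assumed):

```rust
// A borrowed id is cloned into the owned parameter at the call site.
fn consolidate_quotes(catalog: &ParquetDataCatalog, id: &str) -> anyhow::Result<()> {
    catalog.consolidate_data("quote_tick", Some(id.to_string()))
}
```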
@@ -273,7 +268,6 @@ impl ParquetDataCatalog {
            let entry = entry.unwrap();
            entry.path().is_dir()
        });
-
        let has_files = std::fs::read_dir(&directory)?.any(|entry| {
            let entry = entry.unwrap();
            entry.path().is_file()
@@ -294,42 +288,47 @@ impl ParquetDataCatalog {
        start: Option<UnixNanos>,
        end: Option<UnixNanos>,
        where_clause: Option<&str>,
-    ) -> datafusion::error::Result<QueryResult>
+    ) -> Result<QueryResult>
    where
        T: DecodeDataFromRecordBatch + CatalogPathPrefix,
    {
-        let path_str = path.to_str().unwrap();
-        let table_name = path.file_stem().unwrap().to_str().unwrap();
+        let path_str = path.to_str().expect("Failed to convert path to string");
+        let table_name = path
+            .file_stem()
+            .unwrap()
+            .to_str()
+            .expect("Failed to convert path to string");
        let query = build_query(table_name, start, end, where_clause);
        self.session
            .add_file::<T>(table_name, path_str, Some(&query))?;
+
        Ok(self.session.get_query_result())
    }

    /// Query data loaded in the catalog
    pub fn query_directory<T>(
        &mut self,
-        // use instrument_ids or bar_types to query specific subset of the data
        instrument_ids: Vec<String>,
        start: Option<UnixNanos>,
        end: Option<UnixNanos>,
        where_clause: Option<&str>,
-    ) -> datafusion::error::Result<QueryResult>
+    ) -> Result<QueryResult>
    where
        T: DecodeDataFromRecordBatch + CatalogPathPrefix,
    {
        let mut paths = Vec::new();
-        for instrument_id in &instrument_ids {
-            paths.push(self.make_path(T::path_prefix(), Some(instrument_id), None));
+
+        for instrument_id in instrument_ids {
+            paths.extend(self.query_parquet_files(T::path_prefix(), Some(instrument_id))?);
        }

        // If no specific instrument_id is selected query all files for the data type
        if paths.is_empty() {
-            paths.push(self.make_path(T::path_prefix(), None, None));
+            paths.push(self.make_path(T::path_prefix(), None, None)?);
        }

        for path in &paths {
-            let path = path.to_str().unwrap();
+            let path = path.to_str().expect("Failed to convert path to string");
            let query = build_query(path, start, end, where_clause);
            self.session.add_file::<T>(path, path, Some(&query))?;
        }
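
The query methods switch from `datafusion::error::Result` to `anyhow::Result`, yet `?` on DataFusion calls keeps compiling: `anyhow::Error` converts from any `std::error::Error + Send + Sync + 'static`. A sketch of the conversion that `?` performs implicitly:

```rust
use datafusion::error::DataFusionError;

// The blanket `impl From<E> for anyhow::Error` covers `DataFusionError`,
// so `?` lifts DataFusion errors into the unified `anyhow::Result`.
fn lift(err: DataFusionError) -> anyhow::Error {
    anyhow::Error::from(err)
}
```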
@@ -338,12 +337,13 @@ impl ParquetDataCatalog {
    }

    #[allow(dead_code)]
-    fn query_timestamp_bound(
+    pub fn query_timestamp_bound(
        &self,
        data_cls: &str,
-        instrument_id: Option<&String>,
-        is_last: bool,
+        instrument_id: Option<String>,
+        is_last: Option<bool>,
    ) -> Result<Option<i64>> {
+        let is_last = is_last.unwrap_or(true);
        let parquet_files = self.query_parquet_files(data_cls, instrument_id)?;

        if parquet_files.is_empty() {
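
`query_timestamp_bound` is now public and its `is_last` flag is optional, defaulting to the upper bound. Hypothetical calls (data class name assumed):

```rust
// `None` requests the latest `ts_init`; `Some(false)` the earliest.
fn ts_bounds(catalog: &ParquetDataCatalog) -> anyhow::Result<(Option<i64>, Option<i64>)> {
    let last = catalog.query_timestamp_bound("quote_tick", None, None)?;
    let first = catalog.query_timestamp_bound("quote_tick", None, Some(false))?;
    Ok((last, first))
}
```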
@@ -354,8 +354,8 @@ impl ParquetDataCatalog {
            .iter()
            .map(|file| min_max_from_parquet_metadata(file, "ts_init"))
            .collect::<Result<Vec<_>, _>>()?;
-
        let mut timestamps: Vec<i64> = Vec::new();
+
        for min_max in min_max_per_file {
            let (min, max) = min_max;
@@ -377,17 +377,12 @@ impl ParquetDataCatalog {
        }
    }

-    fn query_parquet_files(
+    pub fn query_parquet_files(
        &self,
        type_name: &str,
-        instrument_id: Option<&String>,
+        instrument_id: Option<String>,
    ) -> Result<Vec<PathBuf>> {
-        let mut path = self.base_path.join("data").join(type_name);
-
-        if let Some(id) = instrument_id {
-            path = path.join(id);
-        }
-
+        let path = self.make_directory_path(type_name, instrument_id);
        let mut files = Vec::new();

        if path.exists() {
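
`query_parquet_files` is now public and delegates to `make_directory_path`, so reads and writes agree on the slash-stripped instrument directory. A hypothetical lookup (instrument id and type name assumed):

```rust
use std::path::PathBuf;

// "EUR/USD" resolves to the same `.../data/quote_tick/EURUSD/` directory
// that the write path produces via `make_directory_path`.
fn quote_files(catalog: &ParquetDataCatalog) -> anyhow::Result<Vec<PathBuf>> {
    catalog.query_parquet_files("quote_tick", Some("EUR/USD".to_string()))
}
```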