@@ -33,6 +33,8 @@ class Java:
3333
3434 class ProcessorDetails :
3535 version = '0.0.1'
36+ description = "Decompresses Cerner LZW compressed blobs from a JSON input stream"
37+ tags = ["cerner" , "oracle" , "blob" ]
3638
3739 def __init__ (self , jvm : JVMView ):
3840 super ().__init__ (jvm )
@@ -110,6 +112,7 @@ def transform(self, context: ProcessContext, flowFile: JavaObject) -> FlowFileTr
110112 """
111113
112114 output_contents : list = []
115+ attributes : dict = {k : str (v ) for k , v in flowFile .getAttributes ().items ()}
113116
114117 try :
115118 self .process_context = context
@@ -118,7 +121,7 @@ def transform(self, context: ProcessContext, flowFile: JavaObject) -> FlowFileTr
118121 # read the JSON record(s) from the FlowFile content
119122 input_raw_bytes : bytes | bytearray = flowFile .getContentsAsBytes ()
120123
121- records = []
124+ records : list | dict = []
122125
123126 try :
124127 records = json .loads (input_raw_bytes .decode ())
@@ -131,35 +134,70 @@ def transform(self, context: ProcessContext, flowFile: JavaObject) -> FlowFileTr
131134 try :
132135 records = json .loads (input_raw_bytes .decode ("windows-1252" ))
133136 except json .JSONDecodeError as e :
134- self .logger .error (f"Error decoding JSON: { str (e )} \n with windows-1252" )
135- raise
137+ return self .build_failure_result (
138+ flowFile ,
139+ ValueError (f"Error decoding JSON: { str (e )} \n with windows-1252" ),
140+ attributes = attributes ,
141+ contents = input_raw_bytes ,
142+ )
136143
137144 if not isinstance (records , list ):
138145 records = [records ]
139146
140147 if not records :
141- raise ValueError ("No records found in JSON input" )
148+ return self .build_failure_result (
149+ flowFile ,
150+ ValueError ("No records found in JSON input" ),
151+ attributes = attributes ,
152+ contents = input_raw_bytes ,
153+ )
154+
155+ # sanity check: all blob parts in this FlowFile must belong to the same document_id
156+ doc_ids : set = {str (r .get (self .document_id_field_name , "" )) for r in records }
157+ if len (doc_ids ) > 1 :
158+ return self .build_failure_result (
159+ flowFile ,
160+ ValueError (f"Multiple document IDs in one FlowFile: { list (doc_ids )} " ),
161+ attributes = attributes ,
162+ contents = input_raw_bytes ,
163+ )
142164
143- concatenated_blob_sequence_order = {}
144- output_merged_record = {}
165+ concatenated_blob_sequence_order : dict = {}
166+ output_merged_record : dict = {}
145167
146- have_any_sequence = any (self .blob_sequence_order_field_name in record for record in records )
147- have_any_no_sequence = any (self .blob_sequence_order_field_name not in record for record in records )
168+ have_any_sequence : bool = any (self .blob_sequence_order_field_name in record for record in records )
169+ have_any_no_sequence : bool = any (self .blob_sequence_order_field_name not in record for record in records )
148170
149171 if have_any_sequence and have_any_no_sequence :
150- raise ValueError (
151- f"Mixed records: some have '{ self .blob_sequence_order_field_name } ', some don't. "
152- "Cannot safely reconstruct blob stream."
172+ return self .build_failure_result (
173+ flowFile ,
174+ ValueError (
175+ f"Mixed records: some have '{ self .blob_sequence_order_field_name } ', some don't. "
176+ "Cannot safely reconstruct blob stream."
177+ ),
178+ attributes = attributes ,
179+ contents = input_raw_bytes ,
153180 )
154181
155182 for record in records :
156183 if self .binary_field_name not in record or record [self .binary_field_name ] in (None , "" ):
157- raise ValueError (f"Missing '{ self .binary_field_name } ' in a record" )
184+ return self .build_failure_result (
185+ flowFile ,
186+ ValueError (f"Missing '{ self .binary_field_name } ' in a record" ),
187+ attributes = attributes ,
188+ contents = input_raw_bytes ,
189+ )
158190
159191 if have_any_sequence :
160192 seq = int (record [self .blob_sequence_order_field_name ])
161193 if seq in concatenated_blob_sequence_order :
162- raise ValueError (f"Duplicate { self .blob_sequence_order_field_name } : { seq } " )
194+ return self .build_failure_result (
195+ flowFile ,
196+ ValueError (f"Duplicate { self .blob_sequence_order_field_name } : { seq } " ),
197+ attributes = attributes ,
198+ contents = input_raw_bytes ,
199+ )
200+
163201 concatenated_blob_sequence_order [seq ] = record [self .binary_field_name ]
164202 else :
165203 # no sequence anywhere: preserve record order (0..n-1)
@@ -174,48 +212,100 @@ def transform(self, context: ProcessContext, flowFile: JavaObject) -> FlowFileTr
174212
175213 full_compressed_blob = bytearray ()
176214
177- for k in sorted (concatenated_blob_sequence_order .keys ()):
215+ # double-check that there is no gap in the blob sequence, i.e. no missing blob part.
216+ order_of_blobs_keys = sorted (concatenated_blob_sequence_order .keys ())
217+ for i in range (1 , len (order_of_blobs_keys )):
218+ if order_of_blobs_keys [i ] != order_of_blobs_keys [i - 1 ] + 1 :
219+ return self .build_failure_result (
220+ flowFile ,
221+ ValueError (
222+ f"Sequence gap: missing { order_of_blobs_keys [i - 1 ] + 1 } "
223+ f"(have { order_of_blobs_keys [i - 1 ]} then { order_of_blobs_keys [i ]} )"
224+ ),
225+ attributes = attributes ,
226+ contents = input_raw_bytes ,
227+ )
228+
229+ for k in order_of_blobs_keys :
178230 v = concatenated_blob_sequence_order [k ]
179231
232+ temporary_blob : bytes = b""
233+
180234 if self .binary_field_source_encoding == "base64" :
181235 if not isinstance (v , str ):
182- raise ValueError (f"Expected base64 string in { self .binary_field_name } for part { k } , got { type (v )} " )
236+ return self .build_failure_result (
237+ flowFile ,
238+ ValueError (
239+ f"Expected base64 string in { self .binary_field_name } for part { k } , got { type (v )} "
240+ ),
241+ attributes = attributes ,
242+ contents = input_raw_bytes ,
243+ )
183244 try :
184245 temporary_blob = base64 .b64decode (v , validate = True )
185246 except Exception as e :
186- raise ValueError (f"Error decoding base64 blob part { k } : { e } " )
247+ return self .build_failure_result (
248+ flowFile ,
249+ ValueError (f"Error decoding base64 blob part { k } : { e } " ),
250+ attributes = attributes ,
251+ contents = input_raw_bytes ,
252+ )
187253 else :
188254 # raw bytes path
189255 if isinstance (v , (bytes , bytearray )):
190256 temporary_blob = v
191257 else :
192- raise ValueError (f"Expected bytes in { self .binary_field_name } for part { k } , got { type (v )} " )
193-
258+ return self .build_failure_result (
259+ flowFile ,
260+ ValueError (
261+ f"Expected bytes in { self .binary_field_name } for part { k } , got { type (v )} "
262+ ),
263+ attributes = attributes ,
264+ contents = input_raw_bytes ,
265+ )
266+
194267 full_compressed_blob .extend (temporary_blob )
195268
269+ # add the new attributes to the dict before attempting decompression, so that a failure result still carries some trace.
270+ attributes ["document_id_field_name" ] = str (self .document_id_field_name )
271+ attributes ["document_id" ] = str (output_merged_record .get (self .document_id_field_name , "" ))
272+ attributes ["binary_field" ] = str (self .binary_field_name )
273+ attributes ["output_text_field_name" ] = str (self .output_text_field_name )
274+ attributes ["mime.type" ] = "application/json"
275+ attributes ["blob_parts" ] = str (len (order_of_blobs_keys ))
276+ attributes ["blob_seq_min" ] = str (order_of_blobs_keys [0 ]) if order_of_blobs_keys else ""
277+ attributes ["blob_seq_max" ] = str (order_of_blobs_keys [- 1 ]) if order_of_blobs_keys else ""
278+ attributes ["compressed_len" ] = str (len (full_compressed_blob ))
279+ attributes ["compressed_head_hex" ] = bytes (full_compressed_blob [:16 ]).hex ()
280+
196281 try :
197282 decompress_blob = DecompressLzwCernerBlob ()
198283 decompress_blob .decompress (full_compressed_blob )
199- output_merged_record [self .binary_field_name ] = decompress_blob .output_stream
284+ output_merged_record [self .binary_field_name ] = bytes ( decompress_blob .output_stream )
200285 except Exception as exception :
201- self .logger .error (f"Error decompressing cerner blob: { str (exception )} \n " )
202- raise exception
286+ return self .build_failure_result (
287+ flowFile ,
288+ exception = exception ,
289+ attributes = attributes ,
290+ include_flowfile_attributes = False ,
291+ contents = input_raw_bytes
292+ )
203293
204294 if self .output_mode == "base64" :
205295 output_merged_record [self .binary_field_name ] = \
206296 base64 .b64encode (output_merged_record [self .binary_field_name ]).decode (self .output_charset )
207297
208298 output_contents .append (output_merged_record )
209299
210- attributes : dict = {k : str (v ) for k , v in flowFile .getAttributes ().items ()}
211- attributes ["document_id_field_name" ] = str (self .document_id_field_name )
212- attributes ["binary_field" ] = str (self .binary_field_name )
213- attributes ["output_text_field_name" ] = str (self .output_text_field_name )
214- attributes ["mime.type" ] = "application/json"
215-
216- return FlowFileTransformResult (relationship = "success" ,
300+ return FlowFileTransformResult (relationship = self .REL_SUCCESS ,
217301 attributes = attributes ,
218302 contents = json .dumps (output_contents ).encode ("utf-8" ))
219303 except Exception as exception :
220304 self .logger .error ("Exception during flowfile processing: " + traceback .format_exc ())
221- raise exception
305+ return self .build_failure_result (
306+ flowFile ,
307+ exception ,
308+ attributes = attributes ,
309+ contents = locals ().get ("input_raw_bytes" , flowFile .getContentsAsBytes ()),
310+ include_flowfile_attributes = False
311+ )
0 commit comments