@@ -9,14 +9,18 @@ def from_arff(dsff, path=None, target=TARGET_NAME, missing=MISSING_TOKEN):
99 """ Populate the DSFF file from an ARFF file. """
1010 path = fix_path (dsff , path , ".arff" )
1111 dsff .logger .debug (f"creating DSFF from { path } ..." )
12- d = []
12+ d , features = [], {}
1313 with open (path ) as f :
1414 relation , attributes , data = False , [False , False ], False
1515 for n , l in enumerate (f , 1 ):
1616 l = l .strip ()
17- # ignore comments before @RELATION
18- if l .startswith ("#" ):
19- continue
17+ # get metadata and feature descriptions from comments
18+ if l .startswith ("%" ):
19+ if (m := re .match (r"^\%\s+(.*?)\s*\:\s*(.*?)$" , l )):
20+ name , descr = m .groups ()
21+ features [name ] = descr
22+ elif re .match (r"^\%\d+metadata\s*\:\s*\{.*\}$" ):
23+ dsff .write (metadata = literal_eval (l .split (":" , 1 )))
2024 if not relation :
2125 if l .startswith ("@RELATION " ):
2226 relation = True
@@ -73,12 +77,7 @@ def from_arff(dsff, path=None, target=TARGET_NAME, missing=MISSING_TOKEN):
7377 for j , row in enumerate (d ):
7478 if j > 0 :
7579 row [i ] = {'0' : "False" , '1' : "True" }[row [i ]]
76- dsff .write (d )
77- features = {}
78- for headers in dsff ['data' ].rows :
79- for header in headers :
80- features [header .value ] = ""
81- break
80+ dsff .write (data = d )
8281 dsff .write (features = features )
8382
8483
@@ -144,9 +143,11 @@ def to_arff(dsff, path=None, target=TARGET_NAME, exclude=DEFAULT_EXCL, missing=M
144143 mlen_c = [max (x , len (row [k ]) if types [k ] == "NUMERIC" else len (row [k ])+ 2 ) for k , x in enumerate (mlen_c )]
145144 d .append (row )
146145 # format the resulting data and output the ARFF
147- d = "\n " .join (" " .join (("{: <%s}" % (mlen_c [k ]+ 1 )).format ((x if types [k ] == "NUMERIC" or x == MISSING_TOKEN else \
148- "'%s'" % x ) + "," ) for k , x in enumerate (row )).rstrip (" ," ) for row in d )
149- arff = "@RELATION \" {}\" \n \n {}\n \n @DATA\n {}" .format (name , "\n " .join (a ), d )
146+ d = (nl := "\n " ).join (" " .join (("{: <%s}" % (mlen_c [k ]+ 1 )).format ((x if types [k ] == "NUMERIC" or \
147+ x == MISSING_TOKEN else "'%s'" % x ) + "," ) for k , x in enumerate (row )).rstrip (" ," ) for row in d )
148+ arff = f"@RELATION \" { name } \" \n \n { nl .join (a )} \n \n @DATA\n { d } \n \n " \
149+ f"{ ['' , f'% metadata: { json .dumps (dsff .metadata )} ' ][len (dsff .metadata ) > 0 ]} \n \n " \
150+ f"{ nl .join (f'% { name } : { descr } ' for name , descr in dsff .features .items ())} "
150151 if text :
151152 return arff
152153 with open (path , 'w+' ) as f :
0 commit comments