Skip to content

Commit a55ae57

Browse files
committed
5.1.0 to 6.0.0 changes
1 parent d06ace0 commit a55ae57

39 files changed

Lines changed: 1630 additions & 36 deletions

mlperf_logging/benchmark_meta.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,16 @@
155155
'llama2_70b_lora',
156156
'rgat',
157157
'llama31_405b'
158-
]
158+
],
159+
'6.0': [
160+
'llama31_8b',
161+
'dlrm_dcnv2',
162+
'retinanet',
163+
'flux1',
164+
'llama2_70b_lora',
165+
'rgat',
166+
'llama31_405b'
167+
]
159168
},
160169

161170
'hpc': {

mlperf_logging/compliance_checker/README.md

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@ To check a log file for compliance:
1010

1111
python -m mlperf_logging.compliance_checker [--config YAML] [--usage training/hpc] [--ruleset MLPERF_EDITION] FILENAME
1212

13-
By default, 5.1.0 training edition rules are used and the default config is set to `5.1.0/common.yaml`.
13+
By default, 6.0.0 training edition rules are used and the default config is set to `6.0.0/common.yaml`.
1414
This config will check all common keys and enqueue benchmark specific config to be checked as well.
15-
Old training editions, still supported are 5.0.0, 4.1.0, 4.0.0, 3.1.0, 3.0.0, 2.1.0, 2.0.0, 1.1.0, 1.0.0, 0.7.0 and 0.6.0
15+
Old training editions, still supported are 5.1.0, 5.0.0, 4.1.0, 4.0.0, 3.1.0, 3.0.0, 2.1.0, 2.0.0, 1.1.0, 1.0.0, 0.7.0 and 0.6.0
1616

1717
To check hpc compliance rules (only 1.0.0 ruleset is supported), set --usage hpc --ruleset 1.0.0.
1818

@@ -22,23 +22,23 @@ As log examples use [NVIDIA's training logs](https://github.com/mlperf/training_
2222

2323
### Existing config files for training submissions
2424

25-
5.1.0/common.yaml - currently the default config file, checks common fields complience and equeues benchmark-specific config file
26-
5.1.0/closed_common.yaml - the common rules file for closed submissions. These rules apply to all benchmarks
27-
5.1.0/open_common.yaml - the common rules file for open submissions. These rules apply to all benchmarks
28-
5.1.0/closed_retinanet.yaml - Per-benchmark rules, closed submissions.
29-
5.1.0/closed_llama31_8b.yaml
30-
5.1.0/closed_llama31_405b.yaml
31-
5.1.0/closed_dlrm_dcnv2.yaml
32-
5.1.0/closed_rgat.yaml
33-
5.1.0/closed_llama2_70b_lora.yaml
34-
5.1.0/closed_flux1.yaml
35-
5.1.0/open_retinanet.yaml - Per-benchmark rules, open submissions.
36-
5.1.0/open_llama31_8b.yaml
37-
5.1.0/open_llama31_405b.yaml
38-
5.1.0/open_dlrm_dcnv2.yaml
39-
5.1.0/open_rgat.yaml
40-
5.1.0/open_llama2_70b_lora.yaml
41-
5.1.0/open_flux1.yaml
25+
6.0.0/common.yaml - currently the default config file, checks common fields compliance and enqueues benchmark-specific config file
26+
6.0.0/closed_common.yaml - the common rules file for closed submissions. These rules apply to all benchmarks
27+
6.0.0/open_common.yaml - the common rules file for open submissions. These rules apply to all benchmarks
28+
6.0.0/closed_retinanet.yaml - Per-benchmark rules, closed submissions.
29+
6.0.0/closed_llama31_8b.yaml
30+
6.0.0/closed_llama31_405b.yaml
31+
6.0.0/closed_dlrm_dcnv2.yaml
32+
6.0.0/closed_rgat.yaml
33+
6.0.0/closed_llama2_70b_lora.yaml
34+
6.0.0/closed_flux1.yaml
35+
6.0.0/open_retinanet.yaml - Per-benchmark rules, open submissions.
36+
6.0.0/open_llama31_8b.yaml
37+
6.0.0/open_llama31_405b.yaml
38+
6.0.0/open_dlrm_dcnv2.yaml
39+
6.0.0/open_rgat.yaml
40+
6.0.0/open_llama2_70b_lora.yaml
41+
6.0.0/open_flux1.yaml
4242

4343
### Existing config files for HPC submissions
4444

mlperf_logging/compliance_checker/mlp_compliance.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -315,7 +315,7 @@ def get_parser():
315315
parser.add_argument('--usage', type=str, default='training',
316316
choices=usage_choices(),
317317
help='what WG do the benchmarks come from')
318-
parser.add_argument('--ruleset', type=str, default='5.1.0',
318+
parser.add_argument('--ruleset', type=str, default='6.0.0',
319319
choices=rule_choices(),
320320
help='what version of rules to check the log against')
321321
parser.add_argument('--config', type=str,

mlperf_logging/compliance_checker/mlp_parser/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from .ruleset_410 import parse_file as parse_file_410
1111
from .ruleset_500 import parse_file as parse_file_500
1212
from .ruleset_510 import parse_file as parse_file_510
13+
from .ruleset_600 import parse_file as parse_file_600
1314

1415
def parse_file(filename, ruleset='0.6.0'):
1516
if ruleset == '0.6.0':
@@ -36,5 +37,7 @@ def parse_file(filename, ruleset='0.6.0'):
3637
return parse_file_500(filename)
3738
elif ruleset == '5.1.0':
3839
return parse_file_510(filename)
40+
elif ruleset == '6.0.0':
41+
return parse_file_600(filename)
3942
else:
4043
raise Exception(f'Ruleset "{ruleset}" is not supported')
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
'''
2+
Parses a text MLPerf log into a structured format.
3+
'''
4+
5+
from __future__ import print_function
6+
7+
import collections
8+
import json
9+
import re
10+
import sys
11+
from dataclasses import dataclass
12+
13+
from io import open
14+
15+
@dataclass
16+
class LogLine:
17+
"""Class for keeping track of an item in inventory."""
18+
full_string: str
19+
timestamp: float
20+
key: str
21+
value: str
22+
lineno: int
23+
24+
TOKEN = ':::MLLOG '
25+
26+
27+
def parse_line(line):
28+
if not line.startswith(TOKEN):
29+
return None
30+
31+
return json.loads(line[len(TOKEN):])
32+
33+
34+
def string_to_logline(lineno, string):
35+
''' Returns a LogLine or raises a ValueError '''
36+
m = parse_line(string)
37+
38+
if m is None:
39+
raise ValueError('does not match regex')
40+
41+
args = []
42+
args.append(string) # full string
43+
44+
ts = float(m['time_ms']) # may raise error, e.g. "1.2.3"
45+
# TODO check for weird values
46+
args.append(ts)
47+
48+
args.append(m['key']) # key
49+
50+
j = { 'value': m['value'], 'metadata': m['metadata'] }
51+
args.append(j)
52+
53+
args.append(lineno)
54+
return LogLine(*args)
55+
56+
57+
def parse_file(filename):
58+
''' Reads a file by name and returns list of loglines and list of errors'''
59+
with open(filename, encoding='latin-1') as f:
60+
return parse_generator(f)
61+
62+
63+
def strip_and_dedup(gen):
64+
lines = []
65+
for l in gen:
66+
if TOKEN not in l:
67+
continue
68+
lines.append(re.sub(".*"+TOKEN, TOKEN, l))
69+
return lines
70+
71+
72+
73+
def parse_generator(gen):
74+
''' Reads a generator of lines and returns (loglines, errors)
75+
The list of errors are any parsing issues as a tuple (str_line, error_msg)
76+
'''
77+
loglines = []
78+
failed = []
79+
for lineno, line in enumerate(strip_and_dedup(gen)):
80+
line = line.strip()
81+
try:
82+
ll = string_to_logline(lineno, line)
83+
loglines.append(ll)
84+
except ValueError as e:
85+
failed.append((line, str(e)))
86+
return loglines, failed
87+
88+
89+
if __name__ == '__main__':
90+
if len(sys.argv) != 2:
91+
print('usage: mlp_parser.py FILENAME')
92+
print(' tests parsing on the file.')
93+
sys.exit(1)
94+
95+
filename = sys.argv[1]
96+
lines, errors = parse_file(filename)
97+
98+
print('Parsed {} log lines with {} errors.'.format(len(lines), len(errors)))
99+
100+
if len(errors) > 0:
101+
print('Lines which failed to parse:')
102+
for line, error in errors:
103+
print(' Following line failed: {}'.format(error))
104+
print(line)
105+
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# MLPerf package checker
2+
3+
MLPerf package checker
4+
5+
## Usage
6+
7+
To check an organization's submission package for compliance:
8+
9+
```sh
10+
python3 -m mlperf_logging.package_checker FOLDER USAGE RULESET
11+
```
12+
13+
Currently, USAGE in ["training"] and RULESET in ["0.6.0", "0.7.0", "1.0.0", "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0", "4.0.0", "4.1.0", "5.0.0", "5.1.0", "6.0.0"] are supported.
14+
15+
The package checker checks:
16+
1. The number of result files for each benchmark matches the required count. If
17+
the actual and required counts do not match, an error is printed.
18+
2. For every result file, the logging within the file is compliant. If there are
19+
any compliance errors, they are printed.
20+
3. For every result directory whether the seed checker compliance is met
21+
4. For every result directory whether the reference convergence point compliance (RCP) is met.
22+
5. The system description json files in the systems directory for compliance.
23+
24+
## Tested software versions
25+
Tested and confirmed working using the following software versions:
26+
27+
Python 3.7.7
28+
Python 3.9.2
29+
Python 3.9.10

0 commit comments

Comments
 (0)