Practical-Formal-Methods · jiradeto · Apr 6, 2022 · Apr 11, 2022 · Apr 11, 2022 · Apr 11, 2022
diff --git a/common/experiment_utils.py b/common/experiment_utils.py
@@ -72,6 +72,12 @@ def get_oss_fuzz_corpora_filestore_path():
     return posixpath.join(get_experiment_filestore_path(), 'oss_fuzz_corpora')
 
 
+def get_random_seed_corpora_filestore_path():
+    """Returns path containing the user-provided seed corpora."""
+    return posixpath.join(get_experiment_filestore_path(),
+                          'random_seed_corpora')
+
+
 def get_dispatcher_instance_name(experiment: str) -> str:
     """Returns a dispatcher instance name for an experiment."""
     return 'd-%s' % experiment

diff --git a/experiment/resources/runner-startup-script-template.sh b/experiment/resources/runner-startup-script-template.sh
@@ -46,6 +46,7 @@ docker run \
 -e NO_SEEDS={{no_seeds}} \
 -e NO_DICTIONARIES={{no_dictionaries}} \
 -e OSS_FUZZ_CORPUS={{oss_fuzz_corpus}} \
+-e RANDOM_SEED_CORPUS={{random_seed_corpus}} \
 -e DOCKER_REGISTRY={{docker_registry}} {% if not local_experiment %}-e CLOUD_PROJECT={{cloud_project}} -e CLOUD_COMPUTE_ZONE={{cloud_compute_zone}} {% endif %}\
 -e EXPERIMENT_FILESTORE={{experiment_filestore}} {% if local_experiment %}-v {{experiment_filestore}}:{{experiment_filestore}} {% endif %}\
 -e REPORT_FILESTORE={{report_filestore}} {% if local_experiment %}-v {{report_filestore}}:{{report_filestore}} {% endif %}\

diff --git a/experiment/run_experiment.py b/experiment/run_experiment.py
@@ -22,6 +22,7 @@
 import sys
 import tarfile
 import tempfile
+import zipfile
 from typing import Dict, List
 
 import jinja2
@@ -63,6 +64,10 @@
     'gs://{project}-backup.clusterfuzz-external.appspot.com/corpus/'
     'libFuzzer/{fuzz_target}/public.zip')
 
+# max size allowed per seed corpus for AFL
+CORPUS_ELEMENT_BYTES_LIMIT = 1 * 1024 * 1024
+RANDOM_CORPORA_ZIP_DIR_NAME = "random_seed_corpora_zip"
+
 
 def read_and_validate_experiment_config(config_filename: str) -> Dict:
     """Reads |config_filename|, validates it, finds as many errors as possible,
@@ -148,6 +153,54 @@ def get_directories(parent_dir):
     ]
 
 
+# pylint: disable=too-many-locals
+def validate_and_pack_random_seed_corpus(random_seed_corpus, benchmarks):
+    """Validate and archive seed corpus provided by user and."""
+    if not os.path.isdir(random_seed_corpus):
+        raise ValidationError('Corpus location "%s" is invalid.' %
+                              random_seed_corpus)
+
+    with tempfile.TemporaryDirectory() as zip_dir:
+        for benchmark in benchmarks:
+            benchmark_corpus_dir = os.path.join(random_seed_corpus, benchmark)
+            if not os.path.exists(benchmark_corpus_dir):
+                raise ValidationError('Random seed corpus directory for '
+                                      'benchmark "%s" does not exist.' %
+                                      benchmark)
+            if not os.path.isdir(benchmark_corpus_dir):
+                raise ValidationError('Seed corpus of benchmark "%s" must be '
+                                      'a directory.' % benchmark)
+            if not os.listdir(benchmark_corpus_dir):
+                raise ValidationError(
+                    'Seed corpus of benchmark "%s" is empty.' % benchmark)
+
+            valid_corpus_files = set()
+            for root, _, files in os.walk(benchmark_corpus_dir):
+                for filename in files:
+                    file_path = os.path.join(root, filename)
+                    file_size = os.path.getsize(file_path)
+
+                    if file_size == 0 or file_size > CORPUS_ELEMENT_BYTES_LIMIT:
+                        continue
+                    valid_corpus_files.add(file_path)
+
+            if not valid_corpus_files:
+                raise ValidationError('No valid corpus files for "%s"' %
+                                      benchmark)
+
+            benchmark_corpus_archive_path = os.path.join(zip_dir, f'{benchmark}.zip')
+            with zipfile.ZipFile(benchmark_corpus_archive_path, 'w') as archive:
+                for filename in valid_corpus_files:
+                    dir_name = os.path.dirname(filename)
+                    archive.write(
+                        filename,
+                        os.path.relpath(filename, os.path.join(dir_name, '..')))
+
+        random_seed_corpora_zip_dir = os.path.join(random_seed_corpus,
+                                                   RANDOM_CORPORA_ZIP_DIR_NAME)
+        filesystem.replace_dir(zip_dir, random_seed_corpora_zip_dir)
+
+
 def validate_benchmarks(benchmarks: List[str]):
     """Parses and validates list of benchmarks."""
     benchmark_types = set()
@@ -220,7 +273,8 @@ def start_experiment(  # pylint: disable=too-many-arguments
         concurrent_builds=None,
         measurers_cpus=None,
         runners_cpus=None,
-        use_branch_coverage=False):
+        use_branch_coverage=False,
+        random_seed_corpus=None):
     """Start a fuzzer benchmarking experiment."""
     if not allow_uncommitted_changes:
         check_no_uncommitted_changes()
@@ -250,6 +304,12 @@ def start_experiment(  # pylint: disable=too-many-arguments
     # 12GB is just the amount that KLEE needs, use this default to make KLEE
     # experiments easier to run.
     config['runner_memory'] = config.get('runner_memory', '12GB')
+
+    config['random_seed_corpus'] = random_seed_corpus
+    if config['random_seed_corpus']:
+        validate_and_pack_random_seed_corpus(config['random_seed_corpus'],
+                                             benchmarks)
+
     return start_experiment_from_full_config(config)
 
 
@@ -332,6 +392,15 @@ def filter_file(tar_info):
         for benchmark in config['benchmarks']:
             add_oss_fuzz_corpus(benchmark, oss_fuzz_corpora_dir)
 
+    if config['random_seed_corpus']:
+        random_seed_corpus_zip = os.path.join(config['random_seed_corpus'],
+                                              RANDOM_CORPORA_ZIP_DIR_NAME)
+        filestore_utils.cp(
+            random_seed_corpus_zip,
+            experiment_utils.get_random_seed_corpora_filestore_path(),
+            recursive=True,
+            parallel=True)
+
 
 class BaseDispatcher:
     """Class representing the dispatcher."""
@@ -524,6 +593,10 @@ def main():
                         '--runners-cpus',
                         help='Cpus available to the runners.',
                         required=False)
+    parser.add_argument('-rs',
+                        '--random-seed-corpus',
+                        help='Path to the random seed corpus',
+                        required=False)
 
     all_fuzzers = fuzzer_utils.get_fuzzer_names()
     parser.add_argument('-f',
@@ -593,6 +666,14 @@ def main():
         parser.error('The sum of runners and measurers cpus is greater than the'
                      ' available cpu cores (%d)' % os.cpu_count())
 
+    if args.random_seed_corpus:
+        if args.no_seeds:
+            parser.error(
+                'You cannot start an experiment with no_seeds option if'
+                ' seeds location is provided you')
+        if args.oss_fuzz_corpus:
+            parser.error('Cannot use seeds from multiple sources')
+
     start_experiment(args.experiment_name,
                      args.experiment_config,
                      args.benchmarks,
@@ -605,7 +686,8 @@ def main():
                      concurrent_builds=concurrent_builds,
                      measurers_cpus=measurers_cpus,
                      runners_cpus=runners_cpus,
-                     use_branch_coverage=args.use_branch_coverage)
+                     use_branch_coverage=args.use_branch_coverage,
+                     random_seed_corpus=args.random_seed_corpus)
     return 0
 
 

diff --git a/experiment/runner.py b/experiment/runner.py
@@ -27,6 +27,7 @@
 import threading
 import time
 import zipfile
+import random
 
 from common import benchmark_config
 from common import environment
@@ -115,6 +116,20 @@ def get_clusterfuzz_seed_corpus_path(fuzz_target_path):
     return seed_corpus_path if os.path.exists(seed_corpus_path) else None
 
 
+def _unpack_random_seed_corpus(corpus_directory):
+    "Unpack and randomply pick one input from the seed corpus provided by user"
+    # remove initial seed corpus
+    shutil.rmtree(corpus_directory)
+    os.mkdir(corpus_directory)
+    benchmark = environment.get('BENCHMARK')
+    corpus_archive_filename = posixpath.join(
+        experiment_utils.get_random_seed_corpora_filestore_path(),
+        f'{benchmark}.zip')
+    with zipfile.ZipFile(corpus_archive_filename) as zip_file:
+        selected_file = random.choice(zip_file.infolist())
+        zip_file.extract(selected_file, corpus_directory)
+
+
 def _unpack_clusterfuzz_seed_corpus(fuzz_target_path, corpus_directory):
     """If a clusterfuzz seed corpus archive is available, unpack it into the
     corpus directory if it exists. Copied from unpack_seed_corpus in
@@ -172,7 +187,10 @@ def run_fuzzer(max_total_time, log_filename):
         logs.error('Fuzz target binary not found.')
         return
 
-    _unpack_clusterfuzz_seed_corpus(target_binary, input_corpus)
+    if environment.get('RANDOM_SEED_CORPUS'):
+        _unpack_random_seed_corpus(input_corpus)
+    else:
+        _unpack_clusterfuzz_seed_corpus(target_binary, input_corpus)
     _clean_seed_corpus(input_corpus)
 
     if max_total_time is None:

diff --git a/experiment/scheduler.py b/experiment/scheduler.py
@@ -717,6 +717,7 @@ def render_startup_script_template(instance_name: str, fuzzer: str,
         'oss_fuzz_corpus': experiment_config['oss_fuzz_corpus'],
         'num_cpu_cores': experiment_config['runner_num_cpu_cores'],
         'cpuset': CPUSET,
+        'random_seed_corpus': experiment_config['random_seed_corpus'],
     }
 
     if not local_experiment:

diff --git a/experiment/test_data/experiment-config.yaml b/experiment/test_data/experiment-config.yaml
@@ -31,6 +31,7 @@ git_hash: "git-hash"
 no_seeds: false
 no_dictionaries: false
 oss_fuzz_corpus: false
+random_seed_corpus: false
 description: "Test experiment"
 concurrent_builds: null
 runners_cpus: null

diff --git a/experiment/test_run_experiment.py b/experiment/test_run_experiment.py
@@ -202,6 +202,7 @@ def test_copy_resources_to_bucket(tmp_path):
         'experiment': 'experiment',
         'benchmarks': ['libxslt_xpath'],
         'oss_fuzz_corpus': True,
+        'random_seed_corpus': False,
     }
     try:
         with mock.patch('common.filestore_utils.cp') as mocked_filestore_cp:

diff --git a/experiment/test_scheduler.py b/experiment/test_scheduler.py
@@ -118,6 +118,7 @@ def test_create_trial_instance(benchmark, expected_image, expected_target,
 -e NO_SEEDS=False \\
 -e NO_DICTIONARIES=False \\
 -e OSS_FUZZ_CORPUS=False \\
+-e RANDOM_SEED_CORPUS=False \\
 -e DOCKER_REGISTRY=gcr.io/fuzzbench -e CLOUD_PROJECT=fuzzbench -e CLOUD_COMPUTE_ZONE=us-central1-a \\
 -e EXPERIMENT_FILESTORE=gs://experiment-data \\
 -e REPORT_FILESTORE=gs://web-reports \\