Skip to content

Commit 838fa81

Browse files
authored
Merge pull request #162 from fanlai0990/master
[Async FL] Support Papaya implementation
2 parents d640c27 + 0448ede commit 838fa81

16 files changed

Lines changed: 509 additions & 195 deletions

File tree

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ Now that you have FedScale installed, you can start exploring FedScale following
6060

6161
***We are adding more datasets! Please contribute!***
6262

63-
FedScale consists of 20+ large-scale, heterogeneous FL datasets covering computer vision (CV), natural language processing (NLP), and miscellaneous tasks.
63+
FedScale consists of 20+ large-scale, heterogeneous FL datasets and 70+ various [models](./fedscale/utils/models/cv_models/README.md), covering computer vision (CV), natural language processing (NLP), and miscellaneous tasks.
6464
Each one is associated with its training, validation, and testing datasets.
6565
We acknowledge the contributors of these raw datasets. Please go to the `./benchmark/dataset` directory and follow the dataset [README](./benchmark/dataset/README.md) for more details.
6666

benchmark/configs/async_fl/async_fl.yml

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ ps_ip: localhost
88
# Note that if we collocate ps and worker on same GPU, then we need to decrease this number of available processes on that GPU by 1
99
# E.g., master node has 4 available processes, then 1 for the ps, and worker should be set to: worker:3
1010
worker_ips:
11-
- localhost:[2]
11+
- localhost:[2,2,2,2]
1212

1313
exp_path: $FEDSCALE_HOME/fedscale/core
1414

@@ -28,30 +28,34 @@ setup_commands:
2828
# ========== Additional job configuration ==========
2929
# Default parameters are specified in config_parser.py, wherein more description of the parameter can be found
3030

31+
# NOTE: We are supporting and improving the following implementation (Async FL) in FedScale:
32+
# - "PAPAYA: Practical, Private, and Scalable Federated Learning", MLSys, 2022
33+
# - "Federated Learning with Buffered Asynchronous Aggregation", AISTATS, 2022
34+
35+
# We appreciate your contributions and/or bug reports. Thank you!
36+
3137
job_conf:
32-
- job_name: femnist # Generate logs under this folder: log_path/job_name/time_stamp
33-
- log_path: $FEDSCALE_HOME/benchmark # Path of log files
34-
- num_participants: 800 # Number of participants per round, we use K=100 in our paper, large K will be much slower
35-
- data_set: femnist # Dataset: openImg, google_speech, stackoverflow
38+
- job_name: async_femnist # Generate logs under this folder: log_path/job_name/time_stamp
39+
- log_path: $FEDSCALE_HOME/benchmark # Path of log files
40+
- data_set: femnist # Dataset: openImg, google_speech, stackoverflow
3641
- data_dir: $FEDSCALE_HOME/benchmark/dataset/data/femnist # Path of the dataset
3742
- data_map_file: $FEDSCALE_HOME/benchmark/dataset/data/femnist/client_data_mapping/train.csv # Allocation of data to each client, turn to iid setting if not provided
3843
- device_conf_file: $FEDSCALE_HOME/benchmark/dataset/data/device_info/client_device_capacity # Path of the client trace
3944
- device_avail_file: $FEDSCALE_HOME/benchmark/dataset/data/device_info/client_behave_trace
40-
- model: shufflenet_v2_x2_0 # Models: e.g., shufflenet_v2_x2_0, mobilenet_v2, resnet34, albert-base-v2
41-
- eval_interval: 20 # How many rounds to run a testing on the testing set
42-
- rounds: 500 # Number of rounds to run this training. We use 1000 in our paper, while it may converge w/ ~400 rounds
45+
- model: resnet18                     # NOTE: Please refer to our model zoo README and use models suited for small-image inputs (e.g., 32x32x3)
46+
# - model_zoo: fedscale-zoo
47+
- eval_interval: 5 # How many rounds to run a testing on the testing set
48+
- rounds: 1000 # Number of rounds to run this training. We use 1000 in our paper, while it may converge w/ ~400 rounds
4349
- filter_less: 21 # Remove clients w/ less than 21 samples
4450
- num_loaders: 2
45-
- local_steps: 20
51+
- local_steps: 5
4652
- learning_rate: 0.05
4753
- batch_size: 20
4854
- test_bsz: 20
49-
- use_cuda: False
50-
- decay_round: 50
55+
- ps_port: 12342
56+
- use_cuda: True
5157
- overcommitment: 1.0
52-
- async_buffer: 10
53-
- arrival_interval: 3
54-
55-
56-
57-
58+
- arrival_interval: 5
59+
- max_staleness: 5
60+
- max_concurrency: 100
61+
- async_buffer: 50                 # Number of updates that need to be aggregated before generating a new model version

benchmark/configs/cifar_cpu/cifar_cpu.yml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,14 @@ job_conf:
3434
- num_participants: 4 # Number of participants per round, we use K=100 in our paper, large K will be much slower
3535
- data_set: cifar10 # Dataset: openImg, google_speech, stackoverflow
3636
- data_dir: $FEDSCALE_HOME/benchmark/dataset/data/ # Path of the dataset
37-
- model: shufflenet_v2_x2_0 # Models: e.g., shufflenet_v2_x2_0, mobilenet_v2, resnet34, albert-base-v2# - gradient_policy: yogi # {"fed-yogi", "fed-prox", "fed-avg"}, "fed-avg" by default
37+
- model: shufflenet_v2_x2_0            # NOTE: Please refer to our model zoo README and use models suited for small-image inputs (e.g., 32x32x3)
38+
# - model_zoo: fedscale-zoo           # Default zoo (torchcv) uses the pytorchvision zoo, which cannot support small images well
3839
- eval_interval: 5 # How many rounds to run a testing on the testing set
3940
- rounds: 600 # Number of rounds to run this training. We use 1000 in our paper, while it may converge w/ ~400 rounds
4041
- filter_less: 0 # Remove clients w/ less than 21 samples
4142
- num_loaders: 2
4243
- local_steps: 20
43-
- learning_rate: 0.001
44+
- learning_rate: 0.05
4445
- batch_size: 32
4546
- test_bsz: 32
4647
- use_cuda: False

benchmark/configs/femnist/conf.yml

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,13 @@
22

33
# ========== Cluster configuration ==========
44
# ip address of the parameter server (need 1 GPU process)
5-
ps_ip: localhost
5+
ps_ip: 10.0.0.1
66

77
# ip address of each worker:# of available gpus process on each gpu in this node
88
# Note that if we collocate ps and worker on same GPU, then we need to decrease this number of available processes on that GPU by 1
99
# E.g., master node has 4 available processes, then 1 for the ps, and worker should be set to: worker:3
10-
worker_ips:
11-
- localhost:[2]
10+
worker_ips:
11+
- 10.0.0.1:[4]
1212

1313
exp_path: $FEDSCALE_HOME/fedscale/core
1414

@@ -31,24 +31,21 @@ setup_commands:
3131
job_conf:
3232
- job_name: femnist # Generate logs under this folder: log_path/job_name/time_stamp
3333
- log_path: $FEDSCALE_HOME/benchmark # Path of log files
34-
- num_participants: 20 # Number of participants per round, we use K=100 in our paper, large K will be much slower
34+
- num_participants: 50 # Number of participants per round, we use K=100 in our paper, large K will be much slower
3535
- data_set: femnist # Dataset: openImg, google_speech, stackoverflow
3636
- data_dir: $FEDSCALE_HOME/benchmark/dataset/data/femnist # Path of the dataset
3737
- data_map_file: $FEDSCALE_HOME/benchmark/dataset/data/femnist/client_data_mapping/train.csv # Allocation of data to each client, turn to iid setting if not provided
3838
- device_conf_file: $FEDSCALE_HOME/benchmark/dataset/data/device_info/client_device_capacity # Path of the client trace
3939
- device_avail_file: $FEDSCALE_HOME/benchmark/dataset/data/device_info/client_behave_trace
40-
- model: shufflenet_v2_x2_0 # Models: e.g., shufflenet_v2_x2_0, mobilenet_v2, resnet34, albert-base-v2
41-
- gradient_policy: yogi # {"fed-yogi", "fed-prox", "fed-avg"}, "fed-avg" by default
42-
- eval_interval: 30 # How many rounds to run a testing on the testing set
43-
- rounds: 5000 # Number of rounds to run this training. We use 1000 in our paper, while it may converge w/ ~400 rounds
40+
- model: resnet18                      # NOTE: Please refer to our model zoo README and use models suited for small-image inputs (e.g., 32x32x3)
41+
# - model_zoo: fedscale-zoo
42+
- eval_interval: 10 # How many rounds to run a testing on the testing set
43+
- rounds: 1000 # Number of rounds to run this training. We use 1000 in our paper, while it may converge w/ ~400 rounds
4444
- filter_less: 21 # Remove clients w/ less than 21 samples
4545
- num_loaders: 2
46-
- yogi_eta: 3e-3
47-
- yogi_tau: 1e-8
48-
- local_steps: 20
46+
- local_steps: 5
4947
- learning_rate: 0.05
5048
- batch_size: 20
5149
- test_bsz: 20
52-
- malicious_factor: 4
53-
- use_cuda: False
50+
- use_cuda: True
5451

0 commit comments

Comments
 (0)