agent: "IPPO" # The agent name.
env_name: "atari" # Name of the environment.
env_id: "basketball_pong_v3"
env_seed: 1 # The random seed of the environment.
obs_type: "grayscale_image" # Choice for the Atari env: ram, rgb_image, grayscale_image.
img_size: [84, 84] # Default is 210 x 160 in gym[Atari].
num_stack: 4 # Frame-stack trick.
frame_skip: 4 # Frame-skip trick.
noop_max: 30 # Do the no-op action for a number of steps in [1, noop_max].
learner: "IPPO_Learner" # The learner name.
policy: "Categorical_MAAC_Policy" # The policy name.
representation: "Basic_CNN" # The representation name.
vectorize: "SubprocVecMultiAgentEnv" # The method to vectorize the environments so that they can run in parallel.
runner: "MARL" # The runner.

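These environment settings describe the usual Atari preprocessing pipeline. Below is a minimal sketch of that pipeline, assuming the PettingZoo Atari `basketball_pong_v3` environment and SuperSuit wrappers; the framework's own wrapper chain (including how it handles `noop_max`, which is not shown here) may differ.

```python
from pettingzoo.atari import basketball_pong_v3
import supersuit as ss

# env_id / obs_type: the multi-agent Pong variant with grayscale observations.
env = basketball_pong_v3.parallel_env(obs_type="grayscale_image")
env = ss.frame_skip_v0(env, 4)                 # frame_skip: 4
env = ss.resize_v1(env, x_size=84, y_size=84)  # img_size: [84, 84]
env = ss.frame_stack_v1(env, 4)                # num_stack: 4

observations, infos = env.reset(seed=1)        # env_seed: 1 (newer PettingZoo reset API)
```
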
# The following arguments are for the "Basic_CNN" representation.
use_cnn: True # Whether to use convolutional neural networks as representation.
filters: [32, 32, 64, 64] # The number of output channels of each convolutional layer.
kernels: [8, 4, 4, 4] # The kernel size of each convolutional layer.
strides: [4, 2, 2, 2] # The stride of each convolutional layer.

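The three lists above define one convolutional layer per entry. The sketch below builds the implied stack in PyTorch to show how the flattened feature size comes out; the padding choice is an assumption, so the framework's Basic_CNN may produce a different size.

```python
import torch
import torch.nn as nn

filters, kernels, strides = [32, 32, 64, 64], [8, 4, 4, 4], [4, 2, 2, 2]

# Build one Conv2d + ReLU per entry; padding = kernel // 2 is an assumption here.
layers, in_channels = [], 4  # 4 stacked grayscale frames
for out_channels, k, s in zip(filters, kernels, strides):
    layers += [nn.Conv2d(in_channels, out_channels, k, stride=s, padding=k // 2), nn.ReLU()]
    in_channels = out_channels
cnn = nn.Sequential(*layers, nn.Flatten())

# Infer the flattened representation size for an 84 x 84 input (img_size: [84, 84]).
with torch.no_grad():
    feat_dim = cnn(torch.zeros(1, 4, 84, 84)).shape[-1]
print(feat_dim)  # size of the feature vector fed to the actor/critic heads
```
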
# Recurrent settings for the "Basic_RNN" representation.
use_rnn: False # Whether to use a recurrent neural network as the representation (the representation should be "Basic_RNN").
rnn: "GRU" # The type of recurrent layer.
fc_hidden_sizes: [64, 64, 64] # The hidden sizes of the feed-forward layers in the RNN representation.
recurrent_hidden_size: 64 # The hidden size of the recurrent layer.
N_recurrent_layers: 1 # The number of recurrent layers.
dropout: 0 # Dropout probability in [0, 1]: the probability of an element being zeroed.
normalize: "LayerNorm" # Layer normalization.
initialize: "orthogonal" # Network initializer.
gain: 0.01 # Gain value for network initialization.

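When use_rnn is enabled (with representation: "Basic_RNN"), these settings describe feed-forward layers followed by a recurrent layer and layer normalization. A hypothetical PyTorch sketch under those assumptions; the framework's actual module may differ.

```python
import torch
import torch.nn as nn

class RNNRepresentation(nn.Module):
    """Sketch: fc_hidden_sizes -> GRU (recurrent_hidden_size) -> LayerNorm."""
    def __init__(self, input_dim, fc_hidden_sizes=(64, 64, 64),
                 recurrent_hidden_size=64, num_layers=1, dropout=0.0):
        super().__init__()
        mlp, last = [], input_dim
        for h in fc_hidden_sizes:
            mlp += [nn.Linear(last, h), nn.ReLU()]
            last = h
        self.mlp = nn.Sequential(*mlp)
        self.gru = nn.GRU(last, recurrent_hidden_size, num_layers,
                          batch_first=True, dropout=dropout)
        self.norm = nn.LayerNorm(recurrent_hidden_size)

    def forward(self, x, h0=None):
        # x: [batch, time, input_dim]; h0: optional initial hidden state.
        out, hn = self.gru(self.mlp(x), h0)
        return self.norm(out), hn
```
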
representation_hidden_size: [64, ] # A list of hidden units for each layer of the Basic_MLP representation networks.
actor_hidden_size: [64, ] # A list of hidden units for each layer of the actor network.
critic_hidden_size: [64, ] # A list of hidden units for each layer of the critic network.
activation: "relu" # The activation function of each hidden layer.
activation_action: "sigmoid" # The activation function for the last layer of the actor.
use_parameter_sharing: True # Whether to use parameter sharing for all agents' policies.
use_actions_mask: False # Whether to use an action mask for unavailable actions.

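Parameter sharing is commonly implemented by passing every agent's observation through a single network, with a one-hot agent index appended so the shared weights can still produce agent-specific behaviour. The sketch below illustrates that standard trick; whether this framework appends the index exactly this way is an assumption.

```python
import torch

def shared_policy_inputs(obs_per_agent):
    """obs_per_agent: dict mapping agent name -> 1-D observation tensor."""
    n_agents = len(obs_per_agent)
    inputs = []
    for i, (name, obs) in enumerate(obs_per_agent.items()):
        one_hot = torch.zeros(n_agents)
        one_hot[i] = 1.0
        inputs.append(torch.cat([obs, one_hot]))  # [obs_dim + n_agents]
    # One batch goes through a single shared actor/critic network.
    return torch.stack(inputs)
```
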
seed: 1 # Random seed.
parallels: 16 # The number of environments to run in parallel.
buffer_size: 3200 # Number of transitions (use_rnn is False) or episodes (use_rnn is True) in the buffer.
n_epochs: 10 # Number of epochs to train.
n_minibatch: 1 # Number of minibatches to sample and train on. batch_size = buffer_size // n_minibatch.
learning_rate: 0.0007 # Learning rate.
weight_decay: 0 # The weight decay (L2 regularization) coefficient of the optimizer.

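The comment on n_minibatch defines the minibatch size, so the values above imply the following update arithmetic (assuming buffer_size counts transitions across all parallel environments, which is an interpretation, not something the file states):

```python
buffer_size, n_minibatch, n_epochs, parallels = 3200, 1, 10, 16

batch_size = buffer_size // n_minibatch       # 3200 transitions per minibatch (from the comment above)
rollout_length = buffer_size // parallels     # 200 steps per parallel env per rollout (assumption)
updates_per_rollout = n_epochs * n_minibatch  # 10 gradient updates per collected buffer
```
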
vf_coef: 0.5 # Coefficient factor for critic loss.
ent_coef: 0.01 # Coefficient factor for entropy loss.
target_kl: 0.25 # For MAPPO_KL learner.
clip_range: 0.2 # The clip range for the ratio in the MAPPO_Clip learner.
gamma: 0.99 # Discount factor.

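vf_coef, ent_coef, and clip_range enter the standard PPO-clip objective. A sketch of that loss follows; it shows how the coefficients combine, not necessarily the exact code of the IPPO_Learner.

```python
import torch
import torch.nn.functional as F

def ppo_loss(log_prob, old_log_prob, advantage, value, value_target, entropy,
             clip_range=0.2, vf_coef=0.5, ent_coef=0.01):
    ratio = torch.exp(log_prob - old_log_prob)
    surr1 = ratio * advantage
    surr2 = torch.clamp(ratio, 1 - clip_range, 1 + clip_range) * advantage
    policy_loss = -torch.min(surr1, surr2).mean()     # clipped surrogate
    value_loss = F.mse_loss(value, value_target)      # weighted by vf_coef
    return policy_loss + vf_coef * value_loss - ent_coef * entropy.mean()
```
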
# Tricks
use_linear_lr_decay: False # Whether to use linear learning rate decay.
end_factor_lr_decay: 0.5 # The end factor of the learning rate scheduler.
use_global_state: False # Whether to use the global state to replace merged observations.
use_value_clip: True # Limit the value range.
value_clip_range: 0.2 # The value clip range.
use_value_norm: True # Use running mean and std to normalize the value targets.
use_huber_loss: True # True: use Huber loss; False: use MSE loss.
huber_delta: 10.0 # The threshold at which to change between delta-scaled L1 and L2 loss (for Huber loss).
use_advnorm: True # Whether to use advantage normalization.
use_gae: True # Use the GAE trick.
gae_lambda: 0.95 # The GAE lambda.
use_grad_clip: True # Whether to clip the gradient norm.
grad_clip_norm: 10.0 # The max norm of the gradient.
clip_type: 1 # Gradient clip type for MindSpore: 0: ms.ops.clip_by_value; 1: ms.nn.ClipByNorm().

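Several of these tricks are easiest to read as code: GAE with gamma and gae_lambda, and the value-clip / Huber-loss combination controlled by use_value_clip, value_clip_range, use_huber_loss, and huber_delta. A sketch under those settings; the framework's implementation may differ in details such as masking and reduction.

```python
import torch
import torch.nn.functional as F

def compute_gae(rewards, values, dones, last_value, gamma=0.99, gae_lambda=0.95):
    """rewards, values, dones: tensors of shape [T]; last_value: bootstrap value."""
    advantages = torch.zeros_like(rewards)
    gae, next_value = 0.0, last_value
    for t in reversed(range(rewards.shape[0])):
        delta = rewards[t] + gamma * next_value * (1 - dones[t]) - values[t]
        gae = delta + gamma * gae_lambda * (1 - dones[t]) * gae
        advantages[t] = gae
        next_value = values[t]
    return advantages, advantages + values  # advantages and value targets (returns)

def clipped_value_loss(value, old_value, target, clip_range=0.2, delta=10.0):
    """Value clipping combined with Huber loss (huber_delta: 10.0)."""
    clipped = old_value + torch.clamp(value - old_value, -clip_range, clip_range)
    loss = torch.max(F.huber_loss(value, target, delta=delta, reduction="none"),
                     F.huber_loss(clipped, target, delta=delta, reduction="none"))
    return loss.mean()
```
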
running_steps: 10000000 # The total number of running steps.
eval_interval: 100000 # The number of steps between two consecutive evaluations.
test_episode: 5 # The number of episodes to test in each evaluation period.

log_dir: "logs/ippo/" # Directory to save logs.
model_dir: "models/ippo/" # Directory to save model checkpoints.
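
For reference, a minimal sketch of consuming this file as a plain YAML config; the filename `ippo_atari.yaml` is hypothetical, and the framework that ships this config will have its own loader and runner.

```python
import yaml
from types import SimpleNamespace

# Load the config into an attribute-style namespace, the way runners usually consume it.
with open("ippo_atari.yaml") as f:  # hypothetical filename for this file
    config = SimpleNamespace(**yaml.safe_load(f))

print(config.agent, config.env_id, config.learning_rate)
# Logs are written under config.log_dir ("logs/ippo/") and checkpoints under
# config.model_dir ("models/ippo/"); the surrounding framework creates these directories.
```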