forked from SaipraveenB/model-based-rl
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSkeletonEnvironment.cpp
More file actions
170 lines (121 loc) · 4.32 KB
/
SkeletonEnvironment.cpp
File metadata and controls
170 lines (121 loc) · 4.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
/*
*/
#include <string.h> /*strcmp*/
#include <stdio.h> /*printf*/
#include <stdlib.h>
#include <time.h> /*for time()*/
#include "utils.h"
#include <rlglue/Environment_common.h>/* env_ function prototypes and RL-Glue types */
#include <rlglue/utils/C/RLStruct_util.h> /* helpful functions for allocating structs and cleaning them up */
#include "putils.h"
#include "generator/psr/psr.h"
/* A simple 2D grid-world environment on a SIZE_X x SIZE_Y grid.
The agent starts each episode at (0,0). There are 5 actions {0..4}: stay, +y, -y, -x, +x.
Cells marked non-zero in the generated field block movement. The episode ends when the
agent reaches (TERM_X, TERM_Y) with reward +10; every other step gives reward -0.3.
The observation is the agent's (x,y) followed by a (2*VISIBILITY_KERNEL+1)^2 window of
the field centred on the agent.
*/
/* Observation buffer reused across env_start/env_step (allocated in env_init). */
observation_t this_observation;
/* Step result returned from env_step; its observation member aliases this_observation. */
reward_observation_terminal_t this_reward_observation;
// Current X,Y states.
int cX = 0;
int cY = 0;
/* Goal cell: reaching (TERM_X, TERM_Y) ends the episode. */
#define TERM_X 11
#define TERM_Y 11
/* Grid dimensions. */
#define SIZE_X 12
#define SIZE_Y 12
/* Half-width of the square visibility window written into the observation. */
#define VISIBILITY_KERNEL 3
// Action effects: {dx,dy} for action indices 0..4 (stay, +y, -y, -x, +x).
//int action_e[][2] = { {0,0}, {0,-1}, {0,1}, {-1,0}, {1,0} };
const int action_e[][2] = {{0,0},{0,1},{0,-1},{-1,0},{1,0}};
/* Field generator and the generated field; both allocated in env_init. */
Generator* generator;
Field* field;
/* Initialise the environment: allocate the observation buffer, wire up the
 * step result, and create the field generator and field.
 * Returns the RL-Glue task specification string.
 * NOTE(review): the previous spec string still described the old 21-state
 * chain environment (2 actions, rewards -1..1); updated to match the actual
 * grid-world dynamics implemented below. */
const char* env_init()
{
	/* 5 actions (stay/up/down/left/right); observation ints are the agent
	   (x,y) plus the visibility window; rewards are -0.3 per step, +10 at the goal. */
	const char* task_spec="VERSION RL-Glue-3.0 PROBLEMTYPE episodic DISCOUNTFACTOR 1.0 OBSERVATIONS INTS (0 11) ACTIONS INTS (0 4) REWARDS (-0.3 10.0) EXTRA skeleton_environment(C/C++) by Brian Tanner.";
	/* Allocate the observation variable: 2 ints for (x,y) plus the
	   (2K+1)^2 visibility window. */
	allocateRLStruct(&this_observation,2 + ( 2 * VISIBILITY_KERNEL + 1 ) * ( 2 * VISIBILITY_KERNEL + 1 ) ,0,0);
	this_reward_observation.observation = &this_observation; // 0 observation.
	this_reward_observation.reward = 0; // 0 reward initially.
	this_reward_observation.terminal = 0; // Not terminal state.
	generator = new PSR( SIZE_X, SIZE_Y, SIZE_X/2, SIZE_Y/2 );
	field = new Field( SIZE_X, SIZE_Y );
	return task_spec;
}
/* Copy the (2K+1)x(2K+1) occupancy window centred on the agent's cell
 * (cX, cY) into ob->intArray starting at offset 2. Cells that fall outside
 * the grid are reported as 0. Also dumps the window via dbg(). */
void populate_visible_kernel( const observation_t* ob ){
	dbg(1, "VISIBILITY KERNEL: \n");
	int slot = 2;
	for( int dx = -VISIBILITY_KERNEL; dx <= VISIBILITY_KERNEL; dx++ ){
		for( int dy = -VISIBILITY_KERNEL; dy <= VISIBILITY_KERNEL; dy++ ){
			const int x = cX + dx;
			const int y = cY + dy;
			const int inside = ( x >= 0 && x < SIZE_X && y >= 0 && y < SIZE_Y );
			if( !inside ){
				ob->intArray[slot++] = 0;
				dbg(1, "- ");
			} else {
				const int cell = field->items[ x * SIZE_Y + y ];
				ob->intArray[slot++] = cell;
				dbg(1, "%d ", cell);
			}
		}
		dbg(1,"\n");
	}
}
/* Begin a new episode: the agent returns to the origin, a fresh field is
 * generated, and the initial observation (position + visibility window)
 * is returned. */
const observation_t *env_start()
{
	cX = 0;
	cY = 0;
	generator->generate( field );
	this_observation.intArray[0] = cX;
	this_observation.intArray[1] = cY;
	populate_visible_kernel( &this_observation );
	return &this_observation;
}
/* Advance the environment by one step.
 * Applies the action's (dx,dy) offset, clamps the position to the grid,
 * reverts the move if the target cell is blocked, then fills in the new
 * observation, the reward (-0.3 per step, +10 at the goal) and the
 * terminal flag. Returns a pointer to the shared step-result struct. */
const reward_observation_terminal_t *env_step(const action_t *this_action)
{
	int action = this_action->intArray[0];
	dbg(2,"\nX,Y: %d, %d\n", cX, cY);
	dbg(2,"ACTION: %d\n", action);
	/* Fix: the action index came from external input and was used to index
	   action_e unchecked — an out-of-range value would read out of bounds.
	   Treat invalid actions as "stay". */
	if( action < 0 || action >= (int)( sizeof(action_e) / sizeof(action_e[0]) ) )
		action = 0;
	int episode_over = 0;
	dbg( 2, "ACTUAL:\n");
	//printVisibleState( field->items, SIZE_X, SIZE_Y );
	// Do Action. Remember the old position so a blocked move can be undone.
	int oldX = cX;
	int oldY = cY;
	cX += action_e[action][0];
	cY += action_e[action][1];
	// Bounds checking: clamp to the grid.
	if( cY >= SIZE_Y )
		cY = SIZE_Y-1;
	else if( cY < 0 )
		cY = 0;
	if( cX >= SIZE_X )
		cX = SIZE_X-1;
	else if( cX < 0 )
		cX = 0;
	// A non-zero field cell is an obstacle: revert the move.
	if( field->items[ cX * SIZE_Y + cY ] ){
		cX = oldX;
		cY = oldY;
	}
	// Update reward_observation struct
	this_reward_observation.observation->intArray[0] = cX;
	this_reward_observation.observation->intArray[1] = cY;
	populate_visible_kernel( this_reward_observation.observation );
	if( (TERM_X == cX) && (TERM_Y == cY) ){
		episode_over = 1;
	}
	if( episode_over )
		this_reward_observation.reward = 10;
	else
		this_reward_observation.reward = -0.3; // Negative reward per step so that it takes the shortest path.
	this_reward_observation.terminal = episode_over;
	return &this_reward_observation;
}
void env_cleanup()
{
dbg(3,"Cleaning up..\n");
clearRLStruct(&this_observation);
delete field;
}
/* Answer free-form messages from the experiment program. Only the name
 * query gets a real reply; everything else is brushed off. */
const char* env_message(const char* inMessage) {
	const int is_name_query = ( strcmp( inMessage, "what is your name?" ) == 0 );
	return is_name_query ? "I AM LEGEND. Now shut up and play." : ":P";
}