
[GNUnet-SVN] r31753 - gnunet/src/ats


From: gnunet
Subject: [GNUnet-SVN] r31753 - gnunet/src/ats
Date: Fri, 27 Dec 2013 18:34:34 +0100

Author: oehlmann
Date: 2013-12-27 18:34:34 +0100 (Fri, 27 Dec 2013)
New Revision: 31753

Modified:
   gnunet/src/ats/plugin_ats_ril.c
Log:
added option of softmax action selection strategy
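
This revision introduces two configuration options in the "ats" section, read by the solver's initialization code below: RIL_SELECT chooses the action-selection strategy (any value other than "EGREEDY" enables softmax), and RIL_TEMPERATURE sets the Boltzmann temperature used by softmax (must be positive, default 1.0). A minimal configuration sketch, with illustrative values only:

[ats]
RIL_SELECT = SOFTMAX
RIL_TEMPERATURE = 0.5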

Modified: gnunet/src/ats/plugin_ats_ril.c
===================================================================
--- gnunet/src/ats/plugin_ats_ril.c     2013-12-27 16:01:59 UTC (rev 31752)
+++ gnunet/src/ats/plugin_ats_ril.c     2013-12-27 17:34:34 UTC (rev 31753)
@@ -34,18 +34,20 @@
 #define RIL_INTERVAL_EXPONENT 10
 #define RIL_UTILITY_MAX (double) GNUNET_ATS_MaxBandwidth
 
-#define RIL_DEFAULT_STEP_TIME_MIN GNUNET_TIME_relative_multiply (GNUNET_TIME_UNIT_MILLISECONDS, 500)
-#define RIL_DEFAULT_STEP_TIME_MAX GNUNET_TIME_relative_multiply (GNUNET_TIME_UNIT_MILLISECONDS, 3000)
-#define RIL_DEFAULT_ALGORITHM RIL_ALGO_SARSA
-#define RIL_DEFAULT_DISCOUNT_BETA 1.0
-#define RIL_DEFAULT_DISCOUNT_GAMMA 0.5
-#define RIL_DEFAULT_GRADIENT_STEP_SIZE 0.1
-#define RIL_DEFAULT_TRACE_DECAY 0.5
-#define RIL_DEFAULT_EXPLORE_RATIO 0.1
-#define RIL_DEFAULT_DIVISOR 10
+#define RIL_DEFAULT_STEP_TIME_MIN       GNUNET_TIME_relative_multiply (GNUNET_TIME_UNIT_MILLISECONDS, 500)
+#define RIL_DEFAULT_STEP_TIME_MAX       GNUNET_TIME_relative_multiply (GNUNET_TIME_UNIT_MILLISECONDS, 3000)
+#define RIL_DEFAULT_ALGORITHM           RIL_ALGO_SARSA
+#define RIL_DEFAULT_SELECT              RIL_SELECT_EGREEDY
+#define RIL_DEFAULT_DISCOUNT_BETA       1.0
+#define RIL_DEFAULT_DISCOUNT_GAMMA      0.5
+#define RIL_DEFAULT_GRADIENT_STEP_SIZE  0.1
+#define RIL_DEFAULT_TRACE_DECAY         0.5
+#define RIL_DEFAULT_EXPLORE_RATIO       0.1
+#define RIL_DEFAULT_DIVISOR             10
 #define RIL_DEFAULT_GLOBAL_REWARD_SHARE 0.5
+#define RIL_DEFAULT_TEMPERATURE         1.0
 
-#define RIL_INC_DEC_STEP_SIZE 1
+#define RIL_INC_DEC_STEP_SIZE           1
 
 /**
  * ATS reinforcement learning solver
@@ -79,6 +81,12 @@
   RIL_ALGO_Q = 1
 };
 
+enum RIL_Select
+{
+  RIL_SELECT_EGREEDY,
+  RIL_SELECT_SOFTMAX
+};
+
 enum RIL_E_Modification
 {
   RIL_E_SET,
@@ -118,11 +126,21 @@
   double lambda;
 
   /**
+   * Softmax action-selection temperature
+   */
+  double temperature;
+
+  /**
    * State space divisor
    */
   unsigned long long int divisor;
 
   /**
+   * Action selection strategy
+   */
+  enum RIL_Select select;
+
+  /**
    * Ratio, with what probability an agent should explore in the e-greedy policy
    */
   double explore_ratio;
@@ -1176,7 +1194,88 @@
   }
 }
 
+static int
+agent_select_egreedy (struct RIL_Peer_Agent *agent, double *state)
+{
+  if (agent_decide_exploration(agent))
+  {
+    if (RIL_ALGO_Q == agent->envi->parameters.algorithm)
+    {
+      agent_modify_eligibility(agent, RIL_E_ZERO, NULL);
+    }
+    return agent_get_action_explore(agent, state);
+  }
+  else
+  {
+    if (RIL_ALGO_Q == agent->envi->parameters.algorithm)
+    {
+      agent_modify_eligibility(agent, RIL_E_SET, NULL);
+    }
+    return agent_get_action_best(agent, state);
+  }
+}
+
 /**
+ * Selects the next action with a probability corresponding to its value. The
+ * probability is calculated using a Boltzmann distribution with a temperature
+ * value. The higher the temperature, the closer the selection probabilities
+ * are to uniform. As the temperature approaches 0, the selection becomes
+ * greedy, i.e. the action with the highest value is always chosen.
+ * @param agent the agent performing the action selection
+ * @param state the current state
+ * @return the index of the selected action
+ */
+static int
+agent_select_softmax (struct RIL_Peer_Agent *agent, double *state)
+{
+  int i;
+  double eqt[agent->n];
+  double p[agent->n];
+  double sum = 0;
+  double r;
+
+  if (RIL_ALGO_Q == agent->envi->parameters.algorithm)
+  {
+    agent_modify_eligibility(agent, RIL_E_SET, NULL);
+  }
+
+  for (i=0; i<agent->n; i++)
+  {
+    eqt[i] = exp(agent_estimate_q(agent,state,i) / agent->envi->parameters.temperature);
+    sum += eqt[i];
+  }
+  for (i=0; i<agent->n; i++)
+  {
+    p[i] = eqt[i]/sum;
+  }
+  r = (double) GNUNET_CRYPTO_random_u32 (GNUNET_CRYPTO_QUALITY_WEAK,
+      UINT32_MAX) / (double) UINT32_MAX;
+  sum = 0;
+  for (i=0; i<agent->n; i++)
+  {
+    if (sum + p[i] > r)
+    {
+      return i;
+    }
+    sum += p[i];
+  }
+  GNUNET_assert(GNUNET_NO);
+}
+
+static int
+agent_select_action (struct RIL_Peer_Agent *agent, double *state)
+{
+  if (agent->envi->parameters.select == RIL_SELECT_EGREEDY)
+  {
+    return agent_select_egreedy(agent, state);
+  }
+  else
+  {
+    return agent_select_softmax(agent, state);
+  }
+}
+
+/**
  * Performs one step of the Markov Decision Process. Other than in the literature the step starts
  * after having done the last action a_old. It observes the new state s_next and the reward
  * received. Then the coefficient update is done according to the SARSA or Q-learning method. The
@@ -1188,7 +1287,7 @@
 agent_step (struct RIL_Peer_Agent *agent)
 {
   int a_next = RIL_ACTION_INVALID;
-  int explore;
+  int a_max;
   double *s_next;
   double reward;
 
@@ -1198,44 +1297,27 @@
 
   s_next = envi_get_state (agent->envi, agent);
   reward = envi_get_reward (agent->envi, agent);
-  explore = agent_decide_exploration (agent);
 
   switch (agent->envi->parameters.algorithm)
   {
   case RIL_ALGO_SARSA:
-    if (explore)
-    {
-      a_next = agent_get_action_explore (agent, s_next);
-    }
-    else
-    {
-      a_next = agent_get_action_best (agent, s_next);
-    }
+    a_next = agent_select_action (agent, s_next);
     if (RIL_ACTION_INVALID != agent->a_old)
     {
       //updates weights with selected action (on-policy), if not first step
       agent_update_weights (agent, reward, s_next, a_next);
-      agent_modify_eligibility (agent, RIL_E_SET, s_next);
     }
+    agent_modify_eligibility (agent, RIL_E_SET, s_next);
     break;
 
   case RIL_ALGO_Q:
-    a_next = agent_get_action_best (agent, s_next);
+    a_max = agent_get_action_best (agent, s_next);
     if (RIL_ACTION_INVALID != agent->a_old)
     {
+      //updates weights with best action, disregarding actually selected action (off-policy), if not first step
-      agent_update_weights (agent, reward, s_next, a_next);
+      agent_update_weights (agent, reward, s_next, a_max);
     }
-    if (explore)
-    {
-      a_next = agent_get_action_explore (agent, s_next);
-      agent_modify_eligibility (agent, RIL_E_ZERO, NULL);
-    }
-    else
-    {
-      a_next = agent_get_action_best (agent, s_next);
-      agent_modify_eligibility (agent, RIL_E_SET, s_next);
-    }
+    a_next = agent_select_action (agent, s_next);
     break;
   }
 
@@ -1798,6 +1880,15 @@
   {
     solver->parameters.algorithm = RIL_DEFAULT_ALGORITHM;
   }
+  if (GNUNET_OK == GNUNET_CONFIGURATION_get_value_string (env->cfg, "ats", "RIL_SELECT", &string))
+  {
+    solver->parameters.select = !strcmp (string, "EGREEDY") ? RIL_SELECT_EGREEDY : RIL_SELECT_SOFTMAX;
+    GNUNET_free (string);
+  }
+  else
+  {
+    solver->parameters.select = RIL_DEFAULT_SELECT;
+  }
   if (GNUNET_OK == GNUNET_CONFIGURATION_get_value_string (env->cfg, "ats", "RIL_DISCOUNT_BETA", &string))
   {
     solver->parameters.beta = strtod (string, NULL);
@@ -1883,6 +1974,20 @@
   {
     solver->parameters.reward_global_share = RIL_DEFAULT_GLOBAL_REWARD_SHARE;
   }
+  if (GNUNET_OK == GNUNET_CONFIGURATION_get_value_string (env->cfg, "ats", "RIL_TEMPERATURE", &string))
+  {
+    solver->parameters.temperature = strtod (string, NULL);
+    GNUNET_free (string);
+    if (solver->parameters.temperature <= 0)
+    {
+      LOG (GNUNET_ERROR_TYPE_WARNING, "RIL_TEMPERATURE not positive. Set to default value of %f instead.\n", RIL_DEFAULT_TEMPERATURE);
+      solver->parameters.temperature = RIL_DEFAULT_TEMPERATURE;
+    }
+  }
+  else
+  {
+    solver->parameters.temperature = RIL_DEFAULT_TEMPERATURE;
+  }
   if (GNUNET_OK != GNUNET_CONFIGURATION_get_value_number (env->cfg, "ats", "RIL_SIMULATE", &solver->simulate))
   {
     solver->simulate = GNUNET_NO;



