gnugo-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[gnugo-devel] extract_fuseki patch


From: Douglas Ridgway
Subject: [gnugo-devel] extract_fuseki patch
Date: Thu, 14 Oct 2004 18:10:02 -0600 (MDT)

Changes to extract_fuseki:
  o statistics on outcomes
  o popularity cutoffs added to command line parameters
  o tweaks and bugfixes in game selection

Example output, from 35k 1d+ amateur even fullboard games mostly taken
from Ashley Feniello's collection, is at
http://dridgway.com/Go/fuseki19-0.db.gz . It's about 20k patterns, so
MAXPATNO in mkpat.c needs increasing in order to compile it.

The patch to sgfnode.c overlaps the printsgf - patch I submitted earlier.

doug.

----

Index: sgf/sgfnode.c
===================================================================
RCS file: /cvsroot/gnugo/gnugo/sgf/sgfnode.c,v
retrieving revision 1.27
diff -u -w -u -r1.27 sgfnode.c
--- sgf/sgfnode.c       25 Jan 2004 21:38:49 -0000      1.27
+++ sgf/sgfnode.c       14 Oct 2004 00:18:40 -0000
@@ -1147,6 +1147,7 @@
 
   if (sgferr) {
     fprintf(stderr, "Parse error: %s at position %d\n", sgferr, sgferrpos);
+    sgfFreeNode(root);
     return NULL;
   }
 
@@ -1502,7 +1503,7 @@
 
   sgf_column = 0;
   unparse_game(outfile, root, 1);
-  fclose(outfile);
+  if (outfile != stdout) fclose(outfile);
   
   /* Remove "printed" marks so that the tree can be written multiple
    * times.

Index: patterns/extract_fuseki.c
===================================================================
RCS file: /cvsroot/gnugo/gnugo/patterns/extract_fuseki.c,v
retrieving revision 1.23
diff -u -w -u -r1.23 extract_fuseki.c
--- patterns/extract_fuseki.c   25 May 2004 03:14:03 -0000      1.23
+++ patterns/extract_fuseki.c   14 Oct 2004 23:59:01 -0000
@@ -65,17 +65,53 @@
  * value on the colon line for use by the fuseki module.
  */
 
+/*
+ * Notes on the statistics:
+ * 
+ * The statistics code assumes that every input file is valid. Use
+ * the output file option to sort out which input files are valid, and
+ * check output for problems. Rerun after fixing/removing invalid files.
+ *
+ * Outcome is defined by RE in sgf. Files without a parsable RE, or which
+ * do not have a winner, are invalid and need to be excluded.
+ * 
+ * Pearson chi squared at 5% is used to test significance of
+ * differences among moves at a given position. Moves excluded by
+ * popularity rules are grouped together and considered as one.  A
+ * positive result means that among all possible moves in a position,
+ * there's a difference somewhere. The next question is where. One
+ * clue comes from dchisq, which is the contribution to the overall
+ * chi squared for each move, with larger meaning higher impact on
+ * significance of overall result. Another comes from post hoc tests.
+ * Each pair of moves from a position with a statistically significant
+ * impact of move choice is compared, again with Pearson chi squared
+ * at 5%, and the positive tests printed. No correction is done for
+ * multiple tests. Pairs with less than 6 total moves are not tested,
+ * so it's possible for there to be a significant overall result
+ * without any positive post hocs. In this case, the overall result is
+ * doubtful as well. 
+ *
+ * If the interest is solely in statistics, using min_pos_freq to
+ * avoid positions without enough data to hope for significance makes
+ * sense: 6 at a minimum. 
+ *
+ * Note that the popularity exclusion rules can result in patterns being
+ * left in the db which have no parent in the db.
+ * 
+ */
+
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
 #include <limits.h>
+#include <math.h>
 #include "liberty.h"
 #include "gg_utils.h"
 #include "random.h"
 #include "../sgf/sgftree.h"
 
 #define USAGE "\n\
-Usage: extract_fuseki files boardsize moves patterns handicap strength 
half_board [output file]\n\
+Usage: extract_fuseki files boardsize moves patterns handicap strength 
half_board min_pos_freq min_move_percent min_move_freq [output file]\n\
 files:     The name of a file listing sgf files to examine,\n\
            one filename per line.\n\
 boardsize: Only consider games with this size.\n\
@@ -85,6 +121,11 @@
            10 any handicap game\n\
 strength:  The lowest strength of the players (1k-30k)\n\
 half_board: 0 - full board patterns, 1 - half board patterns\n\
+min_pos_freq: how many times a position must occur before patterns\n\
+           from it are generated\n\
+min_move_percent: minimum popularity of a move in a given position before it\n\
+           gets a pattern\n\
+min_move_freq: minimum number of times a move must occur before it gets a 
pattern\n\
 output file: Optional (if this exists, extract_fuseki will sort the games 
instead)\n\
 "
 
@@ -106,12 +147,35 @@
 /* Lowest strength, given as argument.*/
 int player_strength;
 
+
+/* min # of times a position must be seen before moves from it become
+   patterns 
+   might want this larger to ensure reasonable statistics, 6 or more, say
+   or smaller to hit every move down to unique games, 2 say;
+   or even keep churning out moves with 1
+
+   given as argument
+*/
+
+int min_position_freq;
+
+
+/* popularity arguments */
+double min_move_percent;
+int min_move_freq;
+
+
 /* Number of games to analyze. */
 int number_of_games;
 
 /* The number of games that could not be used for some reason. */
 int *unused_games;
 
+/* WARN 1 warns about unused games */
+/* WARN 2 also notes assumptions about metainfo */
+#define WARN 1
+
+
 /* Dynamically allocated list of sgf file names. */
 char **sgf_names;
 
@@ -141,6 +205,7 @@
 struct situation {
   struct invariant_hash pre;
   struct invariant_hash post;
+  int outcome;
 };
 
 /* Dynamically allocated table of situations encountered in the analysis. */
@@ -153,6 +218,7 @@
 struct frequency {
   int index;
   int n;
+  int n_win;
 };
 
 /* Position frequency table. */ 
@@ -166,6 +232,8 @@
   int index;
   int position_frequency;
   int move_frequency;
+  int position_success;
+  int move_success;
   char pattern[MAX_BOARD][MAX_BOARD];
 };
 
@@ -173,8 +241,19 @@
 struct winner *winning_moves;
 int number_of_winning_moves;
 
+/* critical values of chisquare distribution with n degrees of freedom */
+/* p < 0.05 */
+double chisquarecrit05[] = {3.8415, 5.9915, 7.8147, 9.4877, 11.0705, 12.5916, 
14.0671, 15.5073, 16.9190, 18.3070, 19.6751, 21.0261, 22.3620, 23.6848, 
24.9958, 26.2962, 27.5871, 28.8693, 30.1435, 31.4104, 32.67057, 33.92444, 
35.17246, 36.41503, 37.65248, 38.88514, 40.11327, 41.33714, 42.55697, 43.77297, 
44.98534, 46.19426, 47.39988, 48.60237, 49.80185, 50.99846, 52.19232, 53.38354, 
54.57223, 55.75848, 56.94239, 58.12404, 59.30351, 60.48089,  61.65623, 
62.82962, 64.00111, 65.17077, 66.33865, 67.50481 };
+
+/* p < 0.10, should be same size as 05 */
+double chisquarecrit10[] = {2.7055, 4.6052, 6.2514, 7.7794, 9.2364, 10.6446, 
12.0170, 13.3616, 14.6837, 15.9872, 17.2750, 18.5493, 19.8119, 21.0641, 
22.3071, 23.5418, 24.7690, 25.9894, 27.2036, 28.4120,29.61509, 30.81328, 
32.00690, 33.19624, 34.38159, 35.56317, 36.74122, 37.91592, 39.08747, 40.25602, 
41.42174, 42.58475, 43.74518, 44.90316, 46.05879, 47.21217, 48.36341, 49.51258, 
50.65977, 51.80506, 52.94851, 54.09020, 55.23019, 56.36854, 57.50530, 58.64054, 
59.77429, 60.90661, 62.03754, 63.16712 };
+
+double chisquarecrit01[] = 
{6.63489660102121,9.21034037197618,11.3448667301444,13.2767041359876,15.086272469389,16.8118938297709,18.4753069065824,20.0902350296632,21.6659943334619,23.2092511589544,24.7249703113183,26.2169673055359,27.6882496104570,29.1412377406728,30.5779141668925,31.9999269088152,33.4086636050046,34.8053057347051,36.1908691292701,37.5662347866250,38.9321726835161,40.2893604375938,41.6383981188585,42.9798201393516,44.3141048962192,45.6416826662832,46.9629421247514,48.2782357703155,49.5878844728988,50.8921813115171,52.1913948331919,53.4857718362354,54.7755397601104,56.0609087477891,57.3420734338592,58.619214501687,59.8925000450869,61.1620867636897,62.4281210161849,63.6907397515645,64.9500713352112,66.2062362839932,67.4593479223258,68.7095129693454,69.9568320658382,71.2014002483115,72.4433073765482,73.6826385201058,74.9194743084782,76.1538912490127};
+
+double chisquarecrit001[] = 
{10.8275661706627,13.8155105579643,16.2662361962381,18.4668269529032,20.5150056524329,22.4577444848253,24.3218863478569,26.1244815583761,27.8771648712566,29.5882984450744,31.26413362024,32.9094904073602,34.5281789748709,36.1232736803981,37.6972982183538,39.2523547907685,40.7902167069025,42.31239633168,43.8201959645175,45.3147466181259,46.7970380415613,48.2679422908352,49.7282324664315,51.1785977773774,52.6196557761728,54.0519623885766,55.4760202057452,56.8922853933536,58.3011734897949,59.7030643044299,61.0983060810581,62.4872190570885,63.870098522345,65.2472174609424,66.618828843701,67.9851676260242,69.3464524962412,70.702887411505,72.0546629519878,73.401957518991,74.7449383984238,76.0837627077,77.418578241314,78.749524228043,80.076732010819,81.40032565871,82.720422519124,84.0371337172235,85.350564608593,86.6608151904032};
+
 /*
- * Write the files that are sorted to a specific file
+ * Append the files that are sorted to a specific file
  */
 
 static void
@@ -423,10 +502,11 @@
 
 /* Add a situation to the situation_table array. */
 static void
-add_situation(struct invariant_hash *pre, struct invariant_hash *post)
+add_situation(struct invariant_hash *pre, struct invariant_hash *post, int 
outcome)
 {
   situation_table[number_of_situations].pre = *pre;
   situation_table[number_of_situations].post = *post;
+  situation_table[number_of_situations].outcome = outcome;
   number_of_situations++;
 }
 
@@ -548,7 +628,7 @@
   s.post = *post;
 
   for (k = 0; k < number_of_winning_moves; k++) {
-    if (compare_situations(&situation_table[winning_moves[k].index],
+    if (winning_moves[k].index != -1 && 
compare_situations(&situation_table[winning_moves[k].index],
                           &s) == 0) {
       /* This is a winner. Record the pattern. */
       for (i = 0; i < board_size; i++)
@@ -616,12 +696,12 @@
 }
 
 /* Play through the initial moves of a game. If 'collect_statistics'
- * is one, store all encounterd situations in the situation_table
- * array. Otherwise, see if there are any winners among the situations
+ * is set, store all encountered situations in the situation_table
+ * array. 'collect_statistics' will be set to the color which won the
+ * game.  Otherwise, see if there are any winners among the situations
  * and store the corresponding pattern so that it can subsequently be
  * printed. Return 0 if there was some problem with the game record,
- * e.g. fewer moves than expected.
- */
+ * e.g. fewer moves than expected.  */
 static int
 examine_game(SGFNode *sgf, int collect_statistics)
 {
@@ -649,8 +729,8 @@
     gg_assert(m >= 0 && m < board_size && n >= 0 && n <= board_size);
     hash_board(&prehash, color);
     hash_board_and_move(&posthash, color, m, n);
-    if (collect_statistics)
-      add_situation(&prehash, &posthash);
+    if (collect_statistics != EMPTY)
+      add_situation(&prehash, &posthash, collect_statistics == color);
     else
       store_pattern_if_winner(&prehash, &posthash, color, m, n);
     play_move(POS(m, n), color);
@@ -691,9 +771,10 @@
     return 1;
   
   length = strlen(strength);
-  /* check if dan player */
+  /* check if dan or pro player */
   for (i = 0; i < length; i++)
-    if (strength[i] == 'd')
+    if (strength[i] == 'd' || strength[i] == 'D' 
+       || strength[i] == 'p' || strength[i] == 'P')
       return 1;
   
   /* get the kyu strength as an integer */
@@ -701,6 +782,13 @@
     if (strength[i] == 'k')
       strength[i] = '\0';
     kyu = atoi(strength);
+    if (kyu == 0) {
+      if (player_strength >= 30)
+       return 1;
+      else
+       return 0;
+    }
+
   }
   
   if (kyu <= player_strength)
@@ -710,89 +798,138 @@
   return 0;
 }
 
-/*
- * Sort out the games that can be used
- */
-
-static void
-sort_games(void)
-{
-  int k;
-  int handicap;
-  char *WR; /* white rank */
-  char *BR; /* black rank */
-  
-  for (k = 0; k < number_of_games; k++) {
-    int size;
-    SGFNode *sgf;
-    
-    /* Progress output. */
-    if (k % 500 == 0)
-      fprintf(stderr, "Sorting number %d, %s\n", k, sgf_names[k]);
-    
-    sgf = readsgffilefuseki(sgf_names[k], 0);
-    
-    if (!sgf) {
-      if (0)
-       fprintf(stderr, "Warning: Couldn't open sgf file %s.\n", sgf_names[k]);
-      unused_games[k] = 1; /* the game could not be used */
-      continue;
-    }
     
-    else if (!sgfGetIntProperty(sgf, "SZ", &size) || size != board_size) {
-      if (0)
-       fprintf(stderr, "Warning: wrong size of board %d.\n", size);
-      unused_games[k] = 1; /* the game could not be used */
-      continue; /* Board of wrong size, ignore the game. */
+/* 
+ * used by both sort_games and collect_situations to
+ * check validity of a game record
+ * 0 means failure for any reason
+ * 1 means probably okay, without going through moves
+ */
+static int check_game(SGFNode *sgf, char *sgfname) {
+  int handicap, size;
+  char *WR, *BR; /* white rank */
+  char thirtyk[] = "30k";
+  char *RE;
+  
+    size = 19;
+    if (!sgfGetIntProperty(sgf, "SZ", &size)) {
+      if (WARN>1)
+       fprintf(stderr, "Warning: no SZ in sgf file %s , assuming size %d\n", 
sgfname, size);
+    }
+    if (size != board_size) {
+      if (WARN)
+       fprintf(stderr, "Warning: wrong size of board %d in sgf file %s .\n", 
size, sgfname);
+      return 0;
     }
     
     /* No handicap games */
-    else if (handicap_value == 0) {
+    if (handicap_value == 0) {
       if (sgfGetIntProperty(sgf, "HA", &handicap) && handicap > 1) {
-       if (0)
-         fprintf(stderr, "No handicap games allowed %d\n", handicap);
-       unused_games[k] = 1; /* the game could not be used */
-       continue;
+       if (WARN)
+         fprintf(stderr, "No handicap games allowed, sgf file %s has handicap 
%d\n", sgfname, handicap);
+      return 0;
       }
     }
     
     /* Only handicap games */
-    else if (handicap_value > 1) {
+    if (handicap_value > 1) {
       if (!sgfGetIntProperty(sgf, "HA", &handicap)) {
-       if (0)
-         fprintf(stderr, "Not a handicap game %d\n", handicap);
-       unused_games[k] = 1; /* the game could not be used */
-       continue;
+       if (WARN)
+         fprintf(stderr, "Sgf file %s is not a handicap game\n", sgfname);
+       return 0;
       }
       
       /* only specific handicap games */
-      else if (handicap_value != 10 && handicap != handicap_value) {
-       if (0)
-         fprintf(stderr, "Wrong number of handicap stones %d\n", handicap);
-       unused_games[k] = 1; /* the game could not be used */
-       continue;
+      if (handicap_value != 10 && handicap != handicap_value) {
+       if (WARN)
+         fprintf(stderr, "Sgf file %s has wrong number of handicap stones 
%d\n", sgfname, handicap);
+       return 0;
       }
+
+      /* any reasonable handicap games */
+      if (handicap_value == 10 && (handicap < 2 || handicap > 9)) {
+       if (WARN)
+         fprintf(stderr, "Sgf file %s has wrong/weird number of handicap 
stones %d\n", sgfname, handicap);
+       return 0;
     }
     
-    if (unused_games[k] == 0) {
+    }
+    
+
       /* examine strength of players in the game and compare it 
        * with minimum player strength 
        */
-      if (sgfGetCharProperty(sgf, "BR", &BR) && !enough_strength(BR)) {
-       if (0)
-         fprintf(stderr, "Wrong strength: %s.\n", BR);
-       unused_games[k] = 1; /* the game could not be used */
-       continue;
+
+
+    BR = thirtyk;
+      if (!sgfGetCharProperty(sgf, "BR", &BR)) {
+       if (WARN>1)
+         fprintf(stderr, "No black strength in sgf file %s assuming %s\n", 
sgfname, BR);
+      }
+      if (!enough_strength(BR)) {
+       if (WARN)
+         fprintf(stderr, "Wrong black strength %s in sgf file %s\n", BR, 
sgfname);
+       return 0;
       }
       
-      /* examine strength of players in the game and compare it with 
-       * minimum player strength */
-      else if (sgfGetCharProperty(sgf, "WR", &WR) && !enough_strength(WR)) {
-       if (0)
-         fprintf(stderr, "Wrong strength: %s.\n", WR);
+    WR = thirtyk;
+      if (!sgfGetCharProperty(sgf, "WR", &WR)) {
+       if (WARN>1)
+         fprintf(stderr, "No white strength in sgf file %s assuming %s\n", 
sgfname, WR);
+      }
+      if (!enough_strength(WR)) {
+       if (WARN)
+         fprintf(stderr, "Wrong white strength %s in sgf file %s\n", WR, 
sgfname);
+       return 0;
+      }
+
+    /* must have result */
+    if (!sgfGetCharProperty(sgf, "RE", &RE)) {
+      if (WARN)
+       fprintf(stderr, "No result in game %s\n", sgfname);
+      return 0;
+    }
+    if (strncmp(RE, "B+", 2)!=0  && strncmp(RE, "W+", 2)!=0) {
+
+      if (WARN)
+       fprintf(stderr, "Couldn't parse winner in result %s from game %s\n", 
RE, sgfname);
+      return 0;
+    }
+
+
+      /* looks okay */
+      return 1;
+}
+
+/*
+ * Sort out the games that can be used
+ */
+
+static void
+sort_games(void)
+{
+  int k;
+
+  for (k = 0; k < number_of_games; k++) {
+    SGFNode *sgf;
+    
+    /* Progress output. */
+    if (k % 500 == 0)
+      fprintf(stderr, "Sorting number %d, %s\n", k, sgf_names[k]);
+    
+    sgf = readsgffilefuseki(sgf_names[k], 0);
+    
+    
+    if (!sgf) {
+      if (WARN)
+       fprintf(stderr, "Warning: Couldn't open sgf file %s number %d.\n", 
sgf_names[k], k);
        unused_games[k] = 1; /* the game could not be used */
        continue;
       }
+
+    if (!check_game(sgf, sgf_names[k])) {
+      unused_games[k] = 1;
+      continue;
     }
     
     /* Free memory of SGF file */
@@ -808,14 +945,13 @@
 collect_situations(void)
 {
   int k;
-  int handicap;
-  char *WR; /* white rank */
-  char *BR; /* black rank */
+  int winner; /* who won the game in question */
   
   init_situations();
   for (k = 0; k < number_of_games; k++) {
     int size;
     SGFNode *sgf;
+    char *RE;
     
     /* Progress output. */
     if (k % 500 == 0)
@@ -824,66 +960,29 @@
     sgf = readsgffilefuseki(sgf_names[k], moves_per_game);
     
     if (!sgf) {
-      if (0)
+      if (WARN)
        fprintf(stderr, "Warning: Couldn't open sgf file %s.\n", sgf_names[k]);
       unused_games[k] = 1; /* the game could not be used */
       continue;
     }
-    
-    else if (!sgfGetIntProperty(sgf, "SZ", &size) || size != board_size) {
-      if (0)
-       fprintf(stderr, "Warning: wrong size of board %d.\n", size);
-      unused_games[k] = 1; /* the game could not be used */
-      continue; /* Board of wrong size, ignore the game. */
-    }
-    
-    /* No handicap games */
-    else if (handicap_value == 0) {
-      if (sgfGetIntProperty(sgf, "HA", &handicap) && handicap > 1) {
-       if (0)
-         fprintf(stderr, "No handicap games allowed %d\n", handicap);
-       unused_games[k] = 1; /* the game could not be used */
+    if (!check_game(sgf, sgf_names[k])) {
+      unused_games[k] = 1; 
        continue;
       }
-    }
     
-    /* Only handicap games */
-    else if (handicap_value > 1) {
-      if (!sgfGetIntProperty(sgf, "HA", &handicap)) {
-       if (0)
-         fprintf(stderr, "Not a handicap game %d\n", handicap);
-       unused_games[k] = 1; /* the game could not be used */
-       continue;
+    if (!sgfGetCharProperty(sgf, "RE", &RE)) {
+      gg_assert(0);
       }
-      
-      /* only specific handicap games */
-      else if (handicap_value != 10 && handicap != handicap_value) {
-       if (0)
-         fprintf(stderr, "Wrong number of handicap stones %d\n", handicap);
-       unused_games[k] = 1; /* the game could not be used */
-       continue;
-      }
-    }
-    
-    /* examine strength of players in the game and compare it with 
-     * minimum player strength */
-    if (sgfGetCharProperty(sgf, "BR", &BR) && !enough_strength(BR)) {
-      if (0)
-       fprintf(stderr, "Wrong strength: %s.\n", BR);
-      unused_games[k] = 1; /* the game could not be used */
-      continue;
-    }
-    /* examine strength of players in the game and compare it with 
-     * minimum player strength */
-    else if (sgfGetCharProperty(sgf, "WR", &WR) && !enough_strength(WR)) {
-      if (0)
-       fprintf(stderr, "Wrong strength: %s.\n", WR);
-      unused_games[k] = 1; /* the game could not be used */
-      continue;
+    if (strncmp(RE, "B+", 2)==0 ) {
+      winner = BLACK;
+    } else if (strncmp(RE, "W+", 2)==0 ) {
+      winner = WHITE;
+    } else {
+      gg_assert(0);
     }
     
-    if (!examine_game(sgf, 1)) {
-      if (0)
+    if (!examine_game(sgf, winner)) {
+      if (WARN)
        fprintf(stderr, "Warning: Problem with sgf file %s.\n", sgf_names[k]);
       unused_games[k] = 1; /* the game could not be used */
     }
@@ -932,9 +1031,12 @@
                                    &situation_table[k-1]) != 0) {
       frequency_table[number_of_distinct_positions].index = k;
       frequency_table[number_of_distinct_positions].n = 0;
+      frequency_table[number_of_distinct_positions].n_win = 0;
       number_of_distinct_positions++;
     }
     frequency_table[number_of_distinct_positions-1].n++;
+    frequency_table[number_of_distinct_positions-1].n_win += 
situation_table[k].outcome;
+    
   }
   
   /* Sort the frequency table, in falling order. */
@@ -960,8 +1062,7 @@
   
   /* Starting with the most common position, find the most common
    * moves for each position, until the number of patterns to be
-   * generated is reached. Don't include moves with a frequency
-   * smaller than a tenth of the most common move.
+   * generated is reached.
    */
   for (k = 0; k < number_of_distinct_positions; k++) {
     int index = frequency_table[k].index;
@@ -970,6 +1071,12 @@
     /* Build a new frequency table for the different moves in this position. */
     struct frequency move_frequencies[MAX_BOARD * MAX_BOARD];
     int number_of_moves = 0;
+
+    /* a position must occur a minimum before we analyze and 
+       record the moves from it */
+    if (frequency_table[k].n < min_position_freq) {
+      break;
+    }
     for (i = index; ;i++) {
       if (i == number_of_situations
          || (i > index
@@ -980,9 +1087,11 @@
                                           &situation_table[i-1]) != 0) {
        move_frequencies[number_of_moves].index = i;
        move_frequencies[number_of_moves].n = 0;
+       move_frequencies[number_of_moves].n_win = 0;
        number_of_moves++;
       }
       move_frequencies[number_of_moves-1].n++;
+      move_frequencies[number_of_moves-1].n_win += situation_table[i].outcome;
     }
     
     /* Sort the moves, in falling order. */
@@ -999,19 +1108,39 @@
     
     /* Add the moves to the list of winners. */
     for (i = 0; i < number_of_moves; i++) {
-      /* This is where the cut-off of moves is decided */
-      if (10 * move_frequencies[i].n < move_frequencies[0].n
-         && move_frequencies[i].n < 10)
-       break;
-      /* Take away any move that hasn't been made by at least 2 people */
-      if (move_frequencies[i].n < 2)
-       break;
+      /* This is where the cut-off of moves is decided 
+       *  based on popularity from command line arguments
+       */
+       
+      if (0.01*min_move_percent*move_frequencies[0].n > move_frequencies[i].n 
+         || move_frequencies[i].n < min_move_freq) {
+       winning_moves[number_of_winning_moves].index = -1;
+       winning_moves[number_of_winning_moves].position_frequency =
+         frequency_table[k].n;
+       winning_moves[number_of_winning_moves].move_frequency = 0;
+       winning_moves[number_of_winning_moves].position_success = 
frequency_table[k].n_win;
+       winning_moves[number_of_winning_moves].move_success = 0;
+       while (i<number_of_moves) {
+         gg_assert (0.01*min_move_percent*move_frequencies[0].n 
+                    > move_frequencies[i].n ||
+                    move_frequencies[i].n < min_move_freq);
+         winning_moves[number_of_winning_moves].move_frequency +=      
+           move_frequencies[i].n;
+         winning_moves[number_of_winning_moves].move_success +=
+           move_frequencies[i].n_win;
+         i++;
+       }
+       number_of_winning_moves++;
+       continue;
+      }
       
       winning_moves[number_of_winning_moves].index = move_frequencies[i].index;
       winning_moves[number_of_winning_moves].position_frequency =
        frequency_table[k].n;
       winning_moves[number_of_winning_moves].move_frequency =
        move_frequencies[i].n;
+      winning_moves[number_of_winning_moves].position_success = 
frequency_table[k].n_win;
+      winning_moves[number_of_winning_moves].move_success = 
move_frequencies[i].n_win;
       number_of_winning_moves++;
       
       if (number_of_winning_moves == patterns_to_extract)
@@ -1074,15 +1203,142 @@
 {
   int k, l;
   int m, n;
+  double chisq = 0.0;
+  int df = 0;
+  unsigned int pre = situation_table[winning_moves[0].index].pre.values[0];
+  int first_in_set = 0;
   l = 1;
   for (k = 0; k < number_of_winning_moves; k++) {
     
-    /* do not print errornous patterns */
-    if (winning_moves[k].pattern[0][0] != '\0') {
+    /* do not print erroneous patterns */
+    if (winning_moves[k].pattern[0][0] != '\0' || winning_moves[k].index == 
-1) {
+      double grand_sum = winning_moves[k].position_frequency;
+      double grand_wins = winning_moves[k].position_success;
+      double grand_losses = grand_sum - grand_wins;
+      double row_sum = winning_moves[k].move_frequency;
+      double wins =  winning_moves[k].move_success;
+      double losses = row_sum - wins;
+      double expect_wins = row_sum*grand_wins/grand_sum;
+      double expect_losses = row_sum - expect_wins;
+      /* we're _not_ using a Yates corrected chisquare */
+      /* two reasons: 1. It's never correct for > 2x2 table */
+      /*              2. Our marginals are sampled, not fixed, so
+       *                Yates and usual Fisher exact are wrong distribution
+       *  Straight chi squared is best.
+       */
+      double dchisq = 0.0;
+      /* this was Yates line. it's wrong 
+      if (expect_wins > 0.0) dchisq += pow(gg_abs(wins - 
expect_wins)-0.5,2)/expect_wins;
+      */
+      if (expect_wins > 0.0) dchisq += pow(wins - expect_wins,2)/expect_wins;
+      if (expect_losses > 0.0) dchisq  += pow(losses-expect_losses, 
2)/expect_losses;
+
+      /* did we just finish a set? If so, print totals and reset */
+      if (winning_moves[k].index != -1 && 
situation_table[winning_moves[k].index].pre.values[0] != pre) {
+       /* p-value is 1 - incomplete gamma function(d.o.f/2, chisq/2) */
+       /* variable df is number of moves, actual d.o.f is df-1 */
+       printf("\n### Summary of pattern pre 0x%08x\n### N Chi_squared df: %g 
%g %d ", pre, grand_sum, chisq, df-1);
+       /* and array is indexed at zero for d.o.f = 1... */
+       if (df-1 < 1) {
+         printf("NS\n\n");
+       } else if (df-1 < sizeof(chisquarecrit05)/sizeof(double)  && chisq > 
chisquarecrit05[df-2]) { 
+         /* The overall result is significant at 5%. In this case, at
+            least some authorities will allow us to examine several
+            individual contrasts w/o futher correction. We compare
+            every pair of moves, which is perhaps too many, but the
+            purpose is to give the human expert (who would in any
+            case be required to examine the output) some sense of
+            where the differences are. Something like a Bonferroni
+            correction could result in a significant test overall,
+            but no significant contrasts, which is obviously
+            nonsense. The significance of the overall result must
+            come from somewhere. */
+         int m,n;
+         if (chisq > chisquarecrit001[df-2]) 
+           printf("!!! p < 0.001\n");
+         else if (chisq > chisquarecrit01[df-2]) 
+           printf("!!! p < 0.01\n");
+         else
+           printf("!!! p < 0.05\n");
+         for (m=first_in_set; m<k; m++) {
+           for (n=m+1; n<k; n++) {
+             double grand_sum = winning_moves[m].move_frequency + 
winning_moves[n].move_frequency;
+             double grand_wins = winning_moves[m].move_success + 
winning_moves[n].move_success; 
+             double grand_losses = grand_sum - grand_wins;
+             double row_sum_m = winning_moves[m].move_frequency;
+             double row_sum_n = winning_moves[n].move_frequency;
+
+             double wins_m =  winning_moves[m].move_success;
+             double losses_m = row_sum_m - wins_m;
+             double wins_n =  winning_moves[n].move_success;
+             double losses_n = row_sum_n - wins_n;
+
+             double expect_wins_m = row_sum_m*grand_wins/grand_sum;
+             double expect_losses_m = row_sum_m - expect_wins_m;
+             double expect_wins_n = row_sum_n*grand_wins/grand_sum;
+             double expect_losses_n = row_sum_n - expect_wins_n;
+             double dchisq_m = 0.0;
+             double dchisq_n = 0.0;
+             if (expect_wins_m > 0.0) dchisq_m += pow(wins_m - 
expect_wins_m,2)/expect_wins_m;
+             if (expect_losses_m > 0.0) dchisq_m  += 
pow(losses_m-expect_losses_m, 2)/expect_losses_m;
+             if (expect_wins_n > 0.0) dchisq_n += pow(wins_n - 
expect_wins_n,2)/expect_wins_n;
+             if (expect_losses_n > 0.0) dchisq_n  += 
pow(losses_n-expect_losses_n, 2)/expect_losses_n;
+             /* we demand at least N=6. nonsense with smaller N */
+             if (dchisq_m + dchisq_n > chisquarecrit05[0] && grand_sum > 5) {
+               printf("#### 0x%08x %c 0x%08x (p < 0.05) chisq = %g N = %g\n", 
+                      situation_table[winning_moves[m].index].post.values[0],
+                      (1.0*wins_m/row_sum_m > 1.0*wins_n/row_sum_n)? '>' : '<',
+                      situation_table[winning_moves[n].index].post.values[0],
+                      dchisq_m+dchisq_n, grand_sum);
+             }
+           }
+         }
+         printf("\n\n");
+
+       } else  if (df-1 < sizeof(chisquarecrit10)/sizeof(double)  && chisq > 
chisquarecrit10[df-2]) { 
+         printf("??? p < 0.10\n\n");
+       } else if (!(df-1 < sizeof(chisquarecrit05)/sizeof(double))) {
+         printf("df out of range...\n\n");
+       } else {
+         printf("NS\n\n");
+       }
+       pre = situation_table[winning_moves[k].index].pre.values[0];
+       first_in_set = k;
+       chisq = 0.0;
+       df = 0;
+      }
+      /* increment totals */
+      chisq += dchisq;
+      df++;
+
+      if (winning_moves[k].index == -1) {
+       printf("# Unpopular moves\n");
+       printf("# pre: hopefully same as before\n");
+       printf("# post: could be various\n");
+       printf("# frequency: %d/%d\n", 
+              winning_moves[k].move_frequency,
+              winning_moves[k].position_frequency);
+       printf("# wins: %d/%d\n\n", 
+              winning_moves[k].move_success,
+              winning_moves[k].position_success);
+       printf("# success: %.1f%% vs %.1f%% for this position overall, dchisq = 
%g\n\n", 
+              100.0*winning_moves[k].move_success / 
winning_moves[k].move_frequency,
+              100.0*winning_moves[k].position_success / 
winning_moves[k].position_frequency, dchisq);
+      } else {
       printf("Pattern Fuseki%d\n", l);
-      printf("# %d/%d\n\n", 
+       printf("# pre : 0x%08x\n", 
situation_table[winning_moves[k].index].pre.values[0]);
+       printf("# post: 0x%08x\n", 
situation_table[winning_moves[k].index].post.values[0]);
+       printf("# frequency: %d/%d\n", 
             winning_moves[k].move_frequency,
             winning_moves[k].position_frequency);
+       printf("# wins: %d/%d\n\n", 
+              winning_moves[k].move_success,
+              winning_moves[k].position_success);
+       printf("# success: %.1f%% vs %.1f%% for this position overall, dchisq = 
%g\n\n", 
+              100.0*winning_moves[k].move_success / 
winning_moves[k].move_frequency,
+              100.0*winning_moves[k].position_success / 
winning_moves[k].position_frequency, dchisq);
+       
+
       printf("+");
       for (n = 0; n < board_size; n++) {
        printf("-");
@@ -1108,6 +1364,11 @@
       printf("\n\n:8,-,value(%d)\n\n\n", winning_moves[k].move_frequency);
       l++;
     }
+    } else {
+      fprintf(stderr, "Skipping pattern pre 0x%08x post 0x%08x, stats may be 
wrong...\n", 
+             situation_table[winning_moves[k].index].pre.values[0], 
+             situation_table[winning_moves[k].index].post.values[0]);
+    }
   }
 }
 
@@ -1118,7 +1379,7 @@
   int i = 0;
   
   /* Check number of arguments. */
-  if (argc < 6) {
+  if (argc < 11) {
     printf(USAGE);
     exit(EXIT_FAILURE);
   }
@@ -1144,8 +1405,8 @@
            patterns_to_extract);
   
   handicap_value = atoi(argv[5]);
-  if (handicap_value < 0 || handicap_value > 2)
-    fprintf(stderr, "Warning: wrong handicap value: %d.\n",
+  if (handicap_value < 0 || handicap_value > 10)
+    fprintf(stderr, "Warning: unusual handicap value: %d.\n",
            handicap_value);
   
   player_strength = atoi(argv[6]);
@@ -1158,6 +1419,26 @@
     fprintf(stderr, "Warning: incorrect half_board_flag (0 or 1). Setting the 
value to 0.\n");
     half_board_patterns = 0;
   }
+
+  min_position_freq = atoi(argv[8]);
+  if (min_position_freq < 1) {
+    fprintf(stderr, "Warning: setting min_position_freq to 1\n");
+    min_position_freq = 1;
+  }
+
+  min_move_percent = atof(argv[9]);
+  if (min_move_percent < 0. || min_move_percent > 100.) {
+    fprintf(stderr, "Warning: strange min_move_percent %g, setting to 1%%\n", 
+           min_move_percent);
+    min_move_percent = 1.0;
+  }
+
+  min_move_freq = atoi(argv[10]);
+  if (min_move_freq < 0) {
+    fprintf(stderr, "Warning: strange min_move_freq %d\n", min_move_freq);
+  }
+
+
   /* Count the number of sgf files. */
   number_of_games = read_sgf_filenames(argv[1], NULL);
   
@@ -1179,9 +1460,9 @@
   (void) read_sgf_filenames(argv[1], sgf_names);
   
   /* Save memory by sorting out the games that can be used first */
-  if (argv[8] != NULL) {
+  if (argv[11] != NULL) {
     sort_games();
-    write_sgf_filenames(argv[8], sgf_names);
+    write_sgf_filenames(argv[11], sgf_names);
   }
   
   else {
@@ -1206,6 +1487,9 @@
     fprintf(stderr, "generate OK.\n");
 
     printf("attribute_map value_only\n\n\n");
+    printf("# ");
+    for (i=0; i<argc; i++) printf("%s ", argv[i]);
+    printf("\n\n\n");
 
     /* Write the patterns to stdout in patterns.db format.
      */





reply via email to

[Prev in Thread] Current Thread [Next in Thread]