Register    Login    Forum    Search    FAQ

Board index » FANN » Enhancing the C library




Post new topic Reply to topic  [ 1 post ] 
Author Message
 Post subject: Patch for merging larger datasets.
 Post Posted: Wed Oct 08, 2008 8:29 pm 
Offline

Joined: Fri Sep 12, 2008 9:07 pm
Posts: 3
This patch modifies src/fann_train_data.c and src/include/fann_train.h.

It adds two functions: fann_merge_array_train_data and fann_dump_train_data.

The story behind them is that I'm using fann in some speech classification experiments. My data is all in phonetic transcriptions and feature tracks. The program I wrote has a layer to translate these into fann_train_data structures. I needed a way to merge them all into a single training set.

The first option I tried was to use fann_merge_train_data for each new fann_train_data structure produced from an utterance. However, with the number of training points I have, about 50,000 training tokens in approximately 5,000 utterances, this was too slow.

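The second option is what is in the patch. It takes an array of fann_train_data structures and merges them recursively: the array is split in half, each half is merged on its own, and the two results are joined, much like the merge phase of a mergesort. The number of calls to fann_merge_train_data stays about the same, but each call copies far less data than repeatedly appending to one ever-growing set.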

The second function, fann_dump_train_data, is simply a debugging function to check that features are being translated correctly.

The patch was generated against the gsoc2007 branch of fann.

Code:
Index: src/fann_train_data.c
===================================================================
RCS file: /cvsroot/fann/fann/src/fann_train_data.c,v
retrieving revision 1.34.4.7
diff -u -p -r1.34.4.7 fann_train_data.c
--- src/fann_train_data.c   31 Aug 2008 11:16:24 -0000   1.34.4.7
+++ src/fann_train_data.c   7 Oct 2008 20:57:42 -0000
@@ -610,6 +610,86 @@ FANN_EXTERNAL struct fann_train_data *FA
    return dest;
 }
 
+/* Function: fann_merge_array_train_data
+
+   Merges the *num_data* training sets in the array *data* into one set by
+   recursively halving the array and joining the halves with
+   <fann_merge_train_data>. The elements of *data* are not destroyed here.
+
+   Returns a <struct fann_train_data> that the caller must release with
+   <fann_destroy_train>, or NULL when *data* is NULL, *num_data* is not
+   positive, every element is NULL, or a merge step returns NULL.
+ */
+FANN_EXTERNAL struct fann_train_data* FANN_API fann_merge_array_train_data(struct fann_train_data **data, int num_data)
+{
+  int num_d1, num_d2;
+  struct fann_train_data *d1m = NULL;
+  struct fann_train_data *d2m = NULL;
+  struct fann_train_data *dm = NULL;
+
+  /* Reject a NULL array and non-positive counts up front. Without this a
+     negative count (e.g. -1) rounds the first half up to one element and
+     reads data[0], and a NULL array is dereferenced. */
+  if (data == NULL || num_data <= 0) {
+    return NULL;
+  }
+
+  if (num_data == 1) {
+    /* Duplicate the leaf so that every pointer returned from this function
+       is owned by the caller and can be destroyed further up the call
+       tree. A NULL slot contributes nothing. */
+    return data[0] ? fann_duplicate_train_data(data[0]) : NULL;
+  }
+
+  /* The first half takes the extra element when num_data is odd. */
+  num_d2 = num_data / 2;
+  num_d1 = num_data - num_d2;
+
+  d1m = fann_merge_array_train_data(data, num_d1);
+  d2m = fann_merge_array_train_data(data + num_d1, num_d2);
+
+  /* If only one half produced data, hand it back as is. */
+  if (d1m == NULL) {
+    return d2m;
+  }
+  if (d2m == NULL) {
+    return d1m;
+  }
+
+  dm = fann_merge_train_data(d1m, d2m);
+  /* The intermediate halves are copies made by this function; free them
+     so the recursion does not leak. dm is passed on even if it is NULL. */
+  fann_destroy_train(d1m);
+  fann_destroy_train(d2m);
+
+  return dm;
+}
+
+/* Function: fann_dump_train_data
+
+   Debugging aid: writes each input/output pair of *data* as text to *f*
+   (stderr if *f* is NULL). A NULL *data* is ignored.
+ */
+FANN_EXTERNAL void FANN_API fann_dump_train_data(struct fann_train_data *data, FILE *f)
+{
+  unsigned int i, j; /* NOTE(review): assumes the num_* counts are unsigned int - confirm */
+  if (data == NULL) {
+    return;
+  }
+  f = f ? f : stderr;
+  for (i = 0; i < data->num_data; i++) {
+    fprintf(f, "Num %u\n\ti: ", i);
+    for (j = 0; j < data->num_input; j++) {
+      fprintf(f, "%f ", (double) data->input[i][j]); /* cast keeps %f valid if fann_type is not floating point */
+    }
+    fprintf(f, "\n\to: ");
+    for (j = 0; j < data->num_output; j++) {
+      fprintf(f, "%f ", (double) data->output[i][j]);
+    }
+    fprintf(f, "\n");
+  }
+}
+
 FANN_EXTERNAL struct fann_train_data *FANN_API fann_subset_train_data(struct fann_train_data
                                                        *data, unsigned int pos,
                                                        unsigned int length)
Index: src/include/fann_train.h
===================================================================
RCS file: /cvsroot/fann/fann/src/include/fann_train.h,v
retrieving revision 1.26.4.2
diff -u -p -r1.26.4.2 fann_train.h
--- src/include/fann_train.h   31 Aug 2008 11:16:25 -0000   1.26.4.2
+++ src/include/fann_train.h   7 Oct 2008 20:57:43 -0000
@@ -585,6 +585,17 @@ FANN_EXTERNAL struct fann_train_data *FA
  */
 FANN_EXTERNAL struct fann_train_data *FANN_API fann_duplicate_train_data(struct fann_train_data
                                                        *data);
+/* Function: fann_merge_array_train_data
+
+   Returns a <struct fann_train_data> that is the merge of the *num_data* elements in *data*, or NULL if there is nothing to merge
+ */
+FANN_EXTERNAL struct fann_train_data* FANN_API fann_merge_array_train_data(struct fann_train_data **data, int num_data);
+
+/* Function: fann_dump_train_data
+
+   Dumps *data* to the file *f*.  If *f* is NULL, stderr is used.
+ */
+FANN_EXTERNAL void FANN_API fann_dump_train_data(struct fann_train_data *data, FILE *f);
    
 /* Function: fann_subset_train_data
   


Top 
 Profile  
 
Display posts from previous:  Sort by  
 
Post new topic Reply to topic  [ 1 post ] 

Board index » FANN » Enhancing the C library


Who is online

Users browsing this forum: No registered users and 0 guests

 
 

 
You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum
You cannot post attachments in this forum

Search for:
Jump to: