examples/SFExamples/oggvorbiscodec94/src/libvorbis/vq/latticepare.c

00001 /********************************************************************
00002  *                                                                  *
00003  * THIS FILE IS PART OF THE OggVorbis SOFTWARE CODEC SOURCE CODE.   *
00004  * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
00005  * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
00006  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
00007  *                                                                  *
00008  * THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2001             *
00009  * by the XIPHOPHORUS Company http://www.xiph.org/                  *
00010  *                                                                  *
00011  ********************************************************************
00012 
00013  function: utility for paring low hit count cells from lattice codebook
00014  last mod: $Id: latticepare.c 7187 2004-07-20 07:24:27Z xiphmont $
00015 
00016  ********************************************************************/
00017 
00018 #include <stdlib.h>
00019 #include <stdio.h>
00020 #include <math.h>
00021 #include <string.h>
00022 #include <errno.h>
00023 #include "../lib/scales.h"
00024 #include "bookutil.h"
00025 #include "vqgen.h"
00026 #include "vqsplit.h"
00027 #include "../lib/os.h"
00028 
00029 /* Lattice codebooks have two strengths: important fetaures that are
00030    poorly modelled by global error minimization training (eg, strong
00031    peaks) are not neglected 2) compact quantized representation.
00032 
00033    A fully populated lattice codebook, however, swings point 1 too far
00034    in the opposite direction; rare features need not be modelled quite
00035    so religiously and as such, we waste bits unless we eliminate the
00036    least common cells.  The codebook rep supports unused cells, so we
00037    need to tag such cells and build an auxiliary (non-thresh) search
00038    mechanism to find the proper match quickly */
00039 
00040 /* two basic steps; first is pare the cell for which dispersal creates
00041    the least additional error.  This will naturally choose
00042    low-population cells and cells that have not taken on points from
00043    neighboring paring (but does not result in the lattice collapsing
00044    inward and leaving low population ares totally unmodelled).  After
00045    paring has removed the desired number of cells, we need to build an
00046    auxiliary search for each culled point */
00047 
00048 /* Although lattice books (due to threshhold-based matching) do not
00049    actually use error to make cell selections (in fact, it need not
00050    bear any relation), the 'secondbest' entry finder here is in fact
00051    error/distance based, so latticepare is only useful on such books */
00052 
00053 /* command line:
00054    latticepare latticebook.vqh input_data.vqd <target_cells>
00055 
00056    produces a new output book on stdout 
00057 */
00058 
00059 static float _dist(int el,float *a, float *b){
00060   int i;
00061   float acc=0.f;
00062   for(i=0;i<el;i++){
00063     float val=(a[i]-b[i]);
00064     acc+=val*val;
00065   }
00066   return(acc);
00067 }
00068 
00069 static float *pointlist;
00070 static long points=0;
00071 
00072 void add_vector(codebook *b,float *vec,long n){
00073   int dim=b->dim,i,j;
00074   int step=n/dim;
00075   for(i=0;i<step;i++){
00076     for(j=i;j<n;j+=step){
00077       pointlist[points++]=vec[j];
00078     }
00079   }
00080 }
00081 
00082 static int bestm(codebook *b,float *vec){
00083   encode_aux_threshmatch *tt=b->c->thresh_tree;
00084   int dim=b->dim;
00085   int i,k,o;
00086   int best=0;
00087   
00088   /* what would be the closest match if the codebook was fully
00089      populated? */
00090   
00091   for(k=0,o=dim-1;k<dim;k++,o--){
00092     int i;
00093     for(i=0;i<tt->threshvals-1;i++)
00094       if(vec[o]<tt->quantthresh[i])break;
00095     best=(best*tt->quantvals)+tt->quantmap[i];
00096   }
00097   return(best);
00098 }
00099 
00100 static int closest(codebook *b,float *vec,int current){
00101   encode_aux_threshmatch *tt=b->c->thresh_tree;
00102   int dim=b->dim;
00103   int i,k,o;
00104 
00105   float bestmetric=0;
00106   int bestentry=-1;
00107   int best=bestm(b,vec);
00108 
00109   if(current<0 && b->c->lengthlist[best]>0)return best;
00110 
00111   for(i=0;i<b->entries;i++){
00112     if(b->c->lengthlist[i]>0 && i!=best && i!=current){
00113       float thismetric=_dist(dim, vec, b->valuelist+i*dim);
00114       if(bestentry==-1 || thismetric<bestmetric){
00115         bestentry=i;
00116         bestmetric=thismetric;
00117       }
00118     }
00119   }
00120 
00121   return(bestentry);
00122 }
00123 
00124 static float _heuristic(codebook *b,float *ppt,int secondbest){
00125   float *secondcell=b->valuelist+secondbest*b->dim;
00126   int best=bestm(b,ppt);
00127   float *firstcell=b->valuelist+best*b->dim;
00128   float error=_dist(b->dim,firstcell,secondcell);
00129   float *zero=alloca(b->dim*sizeof(float));
00130   float fromzero;
00131   
00132   memset(zero,0,b->dim*sizeof(float));
00133   fromzero=sqrt(_dist(b->dim,firstcell,zero));
00134 
00135   return(error/fromzero);
00136 }
00137 
00138 static int longsort(const void *a, const void *b){
00139   return **(long **)b-**(long **)a;
00140 }
00141 
00142 void usage(void){
00143   fprintf(stderr,"Ogg/Vorbis lattice codebook paring utility\n\n"
00144           "usage: latticepare book.vqh data.vqd <target_cells> <protected_cells> base\n"
00145           "where <target_cells> is the desired number of final cells (or -1\n"
00146           "                     for no change)\n"
00147           "      <protected_cells> is the number of highest-hit count cells\n"
00148           "                     to protect from dispersal\n"
00149           "      base is the base name (not including .vqh) of the new\n"
00150           "                     book\n\n");
00151   exit(1);
00152 }
00153 
00154 int main(int argc,char *argv[]){
00155   char *basename;
00156   codebook *b=NULL;
00157   int entries=0;
00158   int dim=0;
00159   long i,j,target=-1,protect=-1;
00160   FILE *out=NULL;
00161 
00162   int argnum=0;
00163 
00164   argv++;
00165   if(*argv==NULL){
00166     usage();
00167     exit(1);
00168   }
00169 
00170   while(*argv){
00171     if(*argv[0]=='-'){
00172 
00173       argv++;
00174         
00175     }else{
00176       switch (argnum++){
00177       case 0:case 1:
00178         {
00179           /* yes, this is evil.  However, it's very convenient to parse file
00180              extentions */
00181           
00182           /* input file.  What kind? */
00183           char *dot;
00184           char *ext=NULL;
00185           char *name=strdup(*argv++);
00186           dot=strrchr(name,'.');
00187           if(dot)
00188             ext=dot+1;
00189           else{
00190             ext="";
00191             
00192           }
00193           
00194           
00195           /* codebook */
00196           if(!strcmp(ext,"vqh")){
00197             
00198             basename=strrchr(name,'/');
00199             if(basename)
00200               basename=strdup(basename)+1;
00201             else
00202               basename=strdup(name);
00203             dot=strrchr(basename,'.');
00204             if(dot)*dot='\0';
00205             
00206             b=codebook_load(name);
00207             dim=b->dim;
00208             entries=b->entries;
00209           }
00210           
00211           /* data file; we do actually need to suck it into memory */
00212           /* we're dealing with just one book, so we can de-interleave */ 
00213           if(!strcmp(ext,"vqd") && !points){
00214             int cols;
00215             long lines=0;
00216             char *line;
00217             float *vec;
00218             FILE *in=fopen(name,"r");
00219             if(!in){
00220               fprintf(stderr,"Could not open input file %s\n",name);
00221               exit(1);
00222             }
00223             
00224             reset_next_value();
00225             line=setup_line(in);
00226             /* count cols before we start reading */
00227             {
00228               char *temp=line;
00229               while(*temp==' ')temp++;
00230               for(cols=0;*temp;cols++){
00231                 while(*temp>32)temp++;
00232                 while(*temp==' ')temp++;
00233               }
00234             }
00235             vec=alloca(cols*sizeof(float));
00236             /* count, then load, to avoid fragmenting the hell out of
00237                memory */
00238             while(line){
00239               lines++;
00240               for(j=0;j<cols;j++)
00241                 if(get_line_value(in,vec+j)){
00242                   fprintf(stderr,"Too few columns on line %ld in data file\n",lines);
00243                   exit(1);
00244                 }
00245               if((lines&0xff)==0)spinnit("counting samples...",lines*cols);
00246               line=setup_line(in);
00247             }
00248             pointlist=_ogg_malloc((cols*lines+entries*dim)*sizeof(float));
00249             
00250             rewind(in);
00251             line=setup_line(in);
00252             while(line){
00253               lines--;
00254               for(j=0;j<cols;j++)
00255                 if(get_line_value(in,vec+j)){
00256                   fprintf(stderr,"Too few columns on line %ld in data file\n",lines);
00257                   exit(1);
00258                 }
00259               /* deinterleave, add to heap */
00260               add_vector(b,vec,cols);
00261               if((lines&0xff)==0)spinnit("loading samples...",lines*cols);
00262               
00263               line=setup_line(in);
00264             }
00265             fclose(in);
00266           }
00267         }
00268         break;
00269       case 2:
00270         target=atol(*argv++);
00271         if(target==0)target=entries;
00272         break;
00273       case 3:
00274         protect=atol(*argv++);
00275         break;
00276       case 4:
00277         {
00278           char *buff=alloca(strlen(*argv)+5);
00279           sprintf(buff,"%s.vqh",*argv);
00280           basename=*argv++;
00281 
00282           out=fopen(buff,"w");
00283           if(!out){
00284             fprintf(stderr,"unable ot open %s for output",buff);
00285             exit(1);
00286           }
00287         }
00288         break;
00289       default:
00290         usage();
00291       }
00292     }
00293   }
00294   if(!entries || !points || !out)usage();
00295   if(target==-1)usage();
00296 
00297   /* add guard points */
00298   for(i=0;i<entries;i++)
00299     for(j=0;j<dim;j++)
00300       pointlist[points++]=b->valuelist[i*dim+j];
00301   
00302   points/=dim;
00303 
00304   /* set up auxiliary vectors for error tracking */
00305   {
00306     encode_aux_nearestmatch *nt=NULL;
00307     long pointssofar=0;
00308     long *pointindex;
00309     long indexedpoints=0;
00310     long *entryindex;
00311     long *reventry;
00312     long *membership=_ogg_malloc(points*sizeof(long));
00313     long *firsthead=_ogg_malloc(entries*sizeof(long));
00314     long *secondary=_ogg_malloc(points*sizeof(long));
00315     long *secondhead=_ogg_malloc(entries*sizeof(long));
00316 
00317     long *cellcount=_ogg_calloc(entries,sizeof(long));
00318     long *cellcount2=_ogg_calloc(entries,sizeof(long));
00319     float *cellerror=_ogg_calloc(entries,sizeof(float));
00320     float *cellerrormax=_ogg_calloc(entries,sizeof(float));
00321     long cellsleft=entries;
00322     for(i=0;i<points;i++)membership[i]=-1;
00323     for(i=0;i<entries;i++)firsthead[i]=-1;
00324     for(i=0;i<points;i++)secondary[i]=-1;
00325     for(i=0;i<entries;i++)secondhead[i]=-1;
00326 
00327     for(i=0;i<points;i++){
00328       /* assign vectors to the nearest cell.  Also keep track of second
00329          nearest for error statistics */
00330       float *ppt=pointlist+i*dim;
00331       int    firstentry=closest(b,ppt,-1);
00332       int    secondentry=closest(b,ppt,firstentry);
00333       float firstmetric=_dist(dim,b->valuelist+dim*firstentry,ppt);
00334       float secondmetric=_dist(dim,b->valuelist+dim*secondentry,ppt);
00335       
00336       if(!(i&0xff))spinnit("initializing... ",points-i);
00337     
00338       membership[i]=firsthead[firstentry];
00339       firsthead[firstentry]=i;
00340       secondary[i]=secondhead[secondentry];
00341       secondhead[secondentry]=i;
00342 
00343       if(i<points-entries){
00344         cellerror[firstentry]+=secondmetric-firstmetric;
00345         cellerrormax[firstentry]=max(cellerrormax[firstentry],
00346                                      _heuristic(b,ppt,secondentry));
00347         cellcount[firstentry]++;
00348         cellcount2[secondentry]++;
00349       }
00350     }
00351 
00352     /* which cells are most heavily populated?  Protect as many from
00353        dispersal as the user has requested */
00354     {
00355       long **countindex=_ogg_calloc(entries,sizeof(long *));
00356       for(i=0;i<entries;i++)countindex[i]=cellcount+i;
00357       qsort(countindex,entries,sizeof(long *),longsort);
00358       for(i=0;i<protect;i++){
00359         int ptr=countindex[i]-cellcount;
00360         cellerrormax[ptr]=9e50f;
00361       }
00362     }
00363 
00364     {
00365       fprintf(stderr,"\r");
00366       for(i=0;i<entries;i++){
00367         /* decompose index */
00368         int entry=i;
00369         for(j=0;j<dim;j++){
00370           fprintf(stderr,"%d:",entry%b->c->thresh_tree->quantvals);
00371           entry/=b->c->thresh_tree->quantvals;
00372         }
00373         
00374         fprintf(stderr,":%ld/%ld, ",cellcount[i],cellcount2[i]);
00375       }
00376       fprintf(stderr,"\n");
00377     }
00378 
00379     /* do the automatic cull request */
00380     while(cellsleft>target){
00381       int bestcell=-1;
00382       float besterror=0;
00383       float besterror2=0;
00384       long head=-1;
00385       char spinbuf[80];
00386       sprintf(spinbuf,"cells left to eliminate: %ld : ",cellsleft-target);
00387 
00388       /* find the cell with lowest removal impact */
00389       for(i=0;i<entries;i++){
00390         if(b->c->lengthlist[i]>0){
00391           if(bestcell==-1 || cellerrormax[i]<=besterror2){
00392             if(bestcell==-1 || cellerrormax[i]<besterror2 || 
00393                besterror>cellerror[i]){
00394               besterror=cellerror[i];
00395               besterror2=cellerrormax[i];
00396               bestcell=i;
00397             }
00398           }
00399         }
00400       }
00401 
00402       fprintf(stderr,"\reliminating cell %d                              \n"
00403               "     dispersal error of %g max/%g total (%ld hits)\n",
00404               bestcell,besterror2,besterror,cellcount[bestcell]);
00405 
00406       /* disperse it.  move each point out, adding it (properly) to
00407          the second best */
00408       b->c->lengthlist[bestcell]=0;
00409       head=firsthead[bestcell];
00410       firsthead[bestcell]=-1;
00411       while(head!=-1){
00412         /* head is a point number */
00413         float *ppt=pointlist+head*dim;
00414         int firstentry=closest(b,ppt,-1);
00415         int secondentry=closest(b,ppt,firstentry);
00416         float firstmetric=_dist(dim,b->valuelist+dim*firstentry,ppt);
00417         float secondmetric=_dist(dim,b->valuelist+dim*secondentry,ppt);
00418         long next=membership[head];
00419 
00420         if(head<points-entries){
00421           cellcount[firstentry]++;
00422           cellcount[bestcell]--;
00423           cellerror[firstentry]+=secondmetric-firstmetric;
00424           cellerrormax[firstentry]=max(cellerrormax[firstentry],
00425                                        _heuristic(b,ppt,secondentry));
00426         }
00427 
00428         membership[head]=firsthead[firstentry];
00429         firsthead[firstentry]=head;
00430         head=next;
00431         if(cellcount[bestcell]%128==0)
00432           spinnit(spinbuf,cellcount[bestcell]+cellcount2[bestcell]);
00433 
00434       }
00435 
00436       /* now see that all points that had the dispersed cell as second
00437          choice have second choice reassigned */
00438       head=secondhead[bestcell];
00439       secondhead[bestcell]=-1;
00440       while(head!=-1){
00441         float *ppt=pointlist+head*dim;
00442         /* who are we assigned to now? */
00443         int firstentry=closest(b,ppt,-1);
00444         /* what is the new second closest match? */
00445         int secondentry=closest(b,ppt,firstentry);
00446         /* old second closest is the cell being disbanded */
00447         float oldsecondmetric=_dist(dim,b->valuelist+dim*bestcell,ppt);
00448         /* new second closest error */
00449         float secondmetric=_dist(dim,b->valuelist+dim*secondentry,ppt);
00450         long next=secondary[head];
00451 
00452         if(head<points-entries){
00453           cellcount2[secondentry]++;
00454           cellcount2[bestcell]--;
00455           cellerror[firstentry]+=secondmetric-oldsecondmetric;
00456           cellerrormax[firstentry]=max(cellerrormax[firstentry],
00457                                        _heuristic(b,ppt,secondentry));
00458         }
00459         
00460         secondary[head]=secondhead[secondentry];
00461         secondhead[secondentry]=head;
00462         head=next;
00463 
00464         if(cellcount2[bestcell]%128==0)
00465           spinnit(spinbuf,cellcount2[bestcell]);
00466       }
00467 
00468       cellsleft--;
00469     }
00470 
00471     /* paring is over.  Build decision trees using points that now fall
00472        through the thresh matcher. */
00473     /* we don't free membership; we flatten it in order to use in lp_split */
00474 
00475     for(i=0;i<entries;i++){
00476       long head=firsthead[i];
00477       spinnit("rearranging membership cache... ",entries-i);
00478       while(head!=-1){
00479         long next=membership[head];
00480         membership[head]=i;
00481         head=next;
00482       }
00483     }
00484 
00485     free(secondhead);
00486     free(firsthead);
00487     free(cellerror);
00488     free(cellerrormax);
00489     free(secondary);
00490 
00491     pointindex=_ogg_malloc(points*sizeof(long));
00492     /* make a point index of fall-through points */
00493     for(i=0;i<points;i++){
00494       int best=_best(b,pointlist+i*dim,1);
00495       if(best==-1)
00496         pointindex[indexedpoints++]=i;
00497       spinnit("finding orphaned points... ",points-i);
00498     }
00499 
00500     /* make an entry index */
00501     entryindex=_ogg_malloc(entries*sizeof(long));
00502     target=0;
00503     for(i=0;i<entries;i++){
00504       if(b->c->lengthlist[i]>0)
00505         entryindex[target++]=i;
00506     }
00507 
00508     /* make working space for a reverse entry index */
00509     reventry=_ogg_malloc(entries*sizeof(long));
00510 
00511     /* do the split */
00512     nt=b->c->nearest_tree=
00513       _ogg_calloc(1,sizeof(encode_aux_nearestmatch));
00514 
00515     nt->alloc=4096;
00516     nt->ptr0=_ogg_malloc(sizeof(long)*nt->alloc);
00517     nt->ptr1=_ogg_malloc(sizeof(long)*nt->alloc);
00518     nt->p=_ogg_malloc(sizeof(long)*nt->alloc);
00519     nt->q=_ogg_malloc(sizeof(long)*nt->alloc);
00520     nt->aux=0;
00521 
00522     fprintf(stderr,"Leaves added: %d              \n",
00523             lp_split(pointlist,points,
00524                      b,entryindex,target,
00525                      pointindex,indexedpoints,
00526                      membership,reventry,
00527                      0,&pointssofar));
00528     free(membership);
00529     free(reventry);
00530     free(pointindex);
00531 
00532     /* hack alert.  I should just change the damned splitter and
00533        codebook writer */
00534     for(i=0;i<nt->aux;i++)nt->p[i]*=dim;
00535     for(i=0;i<nt->aux;i++)nt->q[i]*=dim;
00536     
00537     /* recount hits.  Build new lengthlist. reuse entryindex storage */
00538     for(i=0;i<entries;i++)entryindex[i]=1;
00539     for(i=0;i<points-entries;i++){
00540       int best=_best(b,pointlist+i*dim,1);
00541       float *a=pointlist+i*dim;
00542       if(!(i&0xff))spinnit("counting hits...",i);
00543       if(best==-1){
00544         fprintf(stderr,"\nINTERNAL ERROR; a point count not be matched to a\n"
00545                 "codebook entry.  The new decision tree is broken.\n");
00546         exit(1);
00547       }
00548       entryindex[best]++;
00549     }
00550     for(i=0;i<nt->aux;i++)nt->p[i]/=dim;
00551     for(i=0;i<nt->aux;i++)nt->q[i]/=dim;
00552     
00553     /* the lengthlist builder doesn't actually deal with 0 hit entries.
00554        So, we pack the 'sparse' hit list into a dense list, then unpack
00555        the lengths after the build */
00556     {
00557       int upper=0;
00558       long *lengthlist=_ogg_calloc(entries,sizeof(long));
00559       for(i=0;i<entries;i++){
00560         if(b->c->lengthlist[i]>0)
00561           entryindex[upper++]=entryindex[i];
00562         else{
00563           if(entryindex[i]>1){
00564             fprintf(stderr,"\nINTERNAL ERROR; _best matched to unused entry\n");
00565             exit(1);
00566           }
00567         }
00568       }
00569       
00570       /* sanity check */
00571       if(upper != target){
00572         fprintf(stderr,"\nINTERNAL ERROR; packed the wrong number of entries\n");
00573         exit(1);
00574       }
00575     
00576       build_tree_from_lengths(upper,entryindex,lengthlist);
00577       
00578       upper=0;
00579       for(i=0;i<entries;i++){
00580         if(b->c->lengthlist[i]>0)
00581           b->c->lengthlist[i]=lengthlist[upper++];
00582       }
00583 
00584     }
00585   }
00586   /* we're done.  write it out. */
00587   write_codebook(out,basename,b->c);
00588 
00589   fprintf(stderr,"\r                                        \nDone.\n");
00590   return(0);
00591 }
00592 
00593 
00594 
00595 

Generated by  doxygen 1.6.2