/********************************************** localsearch.cxx Copyright 2001 by Adam Meyerson and Liadan O'Callaghan This code is provided for examination by anyone who may be interested in implementing local search clustering algorithms; it is not intended to be run, but to be used as a starting point for anyone who wants to write his or her own implementation. In particular: We, the authors, do not provide a license for the use of this code. If you use this code instead of writing your own, we don't want to know. We do not claim that it works, and will not maintain it or provide support to those who use it. Use of this code is at the risk of the user. ***** Paper reference: Streaming-Data Algorithms for High-Quality Clustering, by Liadan O'Callaghan, Nina Mishra, Adam Meyerson, Sudipto Guha, and Rajeev Motwani, ICDE 2002. ***** This local search code computes a continuous k-median clustering of a given point set, where k is within a user-specified range. To execute: localsearch k1 k2 n d infile samplesize outfile k1 and k2 are lower and upper bounds on the number of medians you want to find. n is the number of points in infile, the file of points to cluster. d is the dimensionality of the points in infile, which will be taken as points in real space with the L2 norm. If samplesize is set to some s > 0, a sample of s points will be taken uniformly at random with replacement from infile, and the clustering will be computed just on this sample. Setting samplesize to 0 or less will result in a clustering based on all n points. infile must contain one point per line, with each point given in the format: w x1 x2 x3 ... xd where w is the weight of the point and its d coordinates are x1 through xd. outfile will be created if it does not exist, and overwritten otherwise. It will contain the medians found by localsearch, and it will be in the same format as infile: the weight of a median will correspond to the total weight of all points assigned to it. There must be a file called 'seeds' in the directory in which localsearch is running; this file must contain an integer. localsearch will use this integer to seed its random number generator, and will replace it with a new integer each time it is run. ***********************************/ #include #include #include #include #include #include #include #include #include #include #define MAXNAMESIZE 1024 // max filename length #define SEED 1 /* increase this to reduce probability of random error */ /* increasing it also ups running time of "speedy" part of the code */ /* SP = 1 seems to be fine */ #define SP 1 // number of repetitions of speedy must be >=1 /* higher ITER --> more likely to get correct # of centers */ /* higher ITER also scales the running time almost linearly */ #define ITER 3 // iterate ITER* k log k times; ITER >= 1 /* this structure represents a point */ /* these will be passed around to avoid copying coordinates */ typedef struct { float weight; float *coord; long assign; /* number of point where this one is assigned */ float cost; /* cost of that assignment, weight*distance */ } Point; /* this is the array of points */ typedef struct { long num; /* number of points; may not be N if this is a sample */ int dim; /* dimensionality */ Point *p; /* the array itself */ } Points; void sample(Points *points, Points *sampl, long size); void sample2(Points *points, long size); int isIdentical(float *i, float *j, int D) // tells whether two points of D dimensions are identical { int a = 0; int equal = 1; while (equal && a < D) { if (i[a] != j[a]) equal = 0; else a++; } if (equal) return 1; else return 0; } void getSeed(int *s) { /* read in random seed from file called "seeds" */ FILE *fpSeeds; fpSeeds = fopen("seeds", "r"); if (fpSeeds == NULL) { cerr << "CREATE A FILE CALLED seeds CONTAINING AN INTEGER!\n"; exit(1); } fscanf(fpSeeds, "%d", s); fclose(fpSeeds); // cout << "Using Random Seed: " << *s << endl; srand48(*s); fpSeeds = fopen("seeds", "w"); fprintf(fpSeeds, "%d\n", lrand48()); fclose(fpSeeds); } /* read the points from filename into the array points */ void readpoints(char *filename, Points *points) { long i, ii; ifstream ifile; ifile.open(filename); points->p = (Point *)malloc(points->num*sizeof(Point)); for(i=0;inum;i++) { points->p[i].coord = (float *)malloc(points->dim*sizeof(float)); ifile >> points->p[i].weight; for (ii=0;iidim;ii++) ifile >> points->p[i].coord[ii]; } } /* read the command line to get kmin, kmax, point array */ int readcommands(int argc, char **argv, long *kmin, long *kmax, Points *points, Points *sampl, long *samplesize, char **outfile) { long i; if (argc<8) { cerr << "usage: " << argv[0] << " k1 k2 n d infile samplesize outfile" << endl; cerr << "where n = # of points, d = # of dimensions;" << endl; cerr << "(note k1 < k2)" << endl; cerr << "Sampling is with replacement whenever samplesize > 0." << endl; cerr << "If samplesize <= 0, algorithm is run on whole dataset." << endl; return(0); } *kmin = atoi(argv[1]); *kmax = atoi(argv[2]); strcpy(*outfile, argv[7]); points->num = atoi(argv[3]); points->dim = atoi(argv[4]); readpoints(argv[5], points); *samplesize = atoi(argv[6]); if (*samplesize > 0) { /* this creates the Points set sampl, frees the memory for points */ sample2(points, *samplesize); } return(1); } /* build a sample of given size, points drawn uniformly at random */ /* with replacement */ void sample(Points *points, Points *sampl, long size) { long i, j; long randpoint; sampl->num = size; sampl->dim = points->dim; sampl->p = (Point *)malloc(sizeof(Point)*size); for( i = 0; i < size; i++ ) { sampl->p[i].coord = (float *)malloc(sampl->dim * sizeof(float)); } for ( i = 0; i < size; i++ ) { // pick a point from points, uniformly at random randpoint = lrand48() % points->num; // copy that point into the ith location in the sample for ( j = 0; j < sampl->dim; j++ ) { sampl->p[i].coord[j] = points->p[randpoint].coord[j]; } sampl->p[i].weight = points->p[randpoint].weight; } for ( i = 0; i < points->num; i++ ) { free(points->p[i].coord); } free(points->p); points->p = sampl->p; points->num = sampl->num; } /* comparator for floating point numbers */ static int floatcomp(const void *i, const void *j) { float a, b; a = *(float *)(i); b = *(float *)(j); if (a > b) return (1); if (a < b) return (-1); return(0); } /* select a sample with total WEIGHT equal to size */ /* this is sampling of size points with replacement */ /* multiple selections increase the effective weight of a point */ /* thus, the total number of points in the sample is <= size */ /* this procedure will free old points structure while running */ void sample2(Points *points, long size) { long i, ii, iii; float totalweight; float *weightarray; float mytotal; Points *sampl; sampl = (Points *)malloc(sizeof(Points)); totalweight=0.0; weightarray = (float *)malloc(sizeof(float)*size); sampl->p=(Point *)malloc(sizeof(Point)*size); for (i=0;inum;i++) totalweight += points->p[i].weight; /* determine random weights between 0 and totalweight */ for (i=0;i totalweight) fprintf(stderr, "Random Number Error!\n"); } /* sort these weights from smallest to largest */ qsort(weightarray, size, sizeof(float), floatcomp); mytotal=0.0; ii=0; iii=0; /* look through all the points, deciding how many times to sample them */ for (i=0;inum;i++) { mytotal += points->p[i].weight; /* check whether current point was sampled */ if ((ii weightarray[ii])) { sampl->p[iii].weight = 1.0; sampl->p[iii].coord = points->p[i].coord; ii++; /* check if this point was chosen again due to high weight */ while ((ii < size)&&(mytotal > weightarray[ii])) { sampl->p[iii].weight += 1.0; ii++; } iii++; } /* this point will not appear in the sample */ else { free(points->p[i].coord); } } sampl->num = iii; sampl->dim = points->dim; free(points->p); /* copy sampl over to points */ points->num = sampl->num; points->dim = sampl->dim; points->p = sampl->p; } /* shuffle points into random order */ void shuffle(Points *points) { long i, j; Point temp; for (i=0;inum-1;i++) { j=(lrand48()%(points->num - i)) + i; temp = points->p[i]; points->p[i] = points->p[j]; points->p[j] = temp; } } /* shuffle an array of integers */ void intshuffle(int *intarray, int length) { long i, j; int temp; for (i=0;inum); /* create center at first point, send it to itself */ centers[0] = 0; *k=1; points->p[0].assign=0; points->p[0].cost = 0.0; totalcost = z; for (i=1;inum;i++) { /* find closest center */ closest=0; mindist=dist(points->p[i],points->p[0],points->dim); for (ii=1;ii<*k;ii++) { distance=dist(points->p[i], points->p[centers[ii]],points->dim); if (distance < mindist) { mindist=distance; closest=ii; } } /* decide whether to open new facility */ if (((float)lrand48()/(float)INT_MAX)<(mindist*points->p[i].weight/z)) { centers[(*k)++] = i; points->p[i].assign=i; points->p[i].cost = 0.0; totalcost+=z; } else { points->p[i].assign=centers[closest]; points->p[i].cost = mindist*points->p[i].weight; totalcost += points->p[i].cost; } } #ifdef PRINTINFO fprintf(stderr, "Speedy opened %d facilities for total cost %lf\n", *k, totalcost); fprintf(stderr, "Distance Cost %lf\n", totalcost - z*(*k)); #endif free(centers); return(totalcost); } /* run speedy on the points, return total cost of solution */ float speedy2(Points *points, float z, long *k, long klim) { long i, ii, closest; long *centers; float mindist, distance; float totalcost; #ifdef PRINTINFO fprintf(stderr, "Speedy: facility cost %lf\n", z); #endif centers=(long *)malloc(sizeof(long)*points->num); /* create center at first point, send it to itself */ centers[0] = 0; *k=1; points->p[0].assign=0; points->p[0].cost = 0.0; totalcost = z; for (i=1;inum;i++) { /* find closest center */ closest=0; mindist=dist(points->p[i],points->p[0],points->dim); for (ii=1;ii<*k;ii++) { distance=dist(points->p[i], points->p[centers[ii]],points->dim); if (distance < mindist) { mindist=distance; closest=ii; } } /* decide whether to open new facility */ if (((float)lrand48()/(float)INT_MAX)<(mindist*points->p[i].weight/z)) { centers[(*k)++] = i; points->p[i].assign=i; points->p[i].cost = 0.0; totalcost+=z; } else { points->p[i].assign=centers[closest]; points->p[i].cost = mindist*points->p[i].weight; totalcost += points->p[i].cost; } if (*k > klim) { free(centers); return(totalcost); } } #ifdef PRINTINFO fprintf(stderr, "Speedy opened %d facilities for total cost %lf\n", *k, totalcost); fprintf(stderr, "Distance Cost %lf\n", totalcost - z*(*k)); #endif free(centers); return(totalcost); } /* For a given point x, find the cost of the following operation: * -- open a facility at x if there isn't already one there, * -- for points y such that the assignment distance of y exceeds dist(y, x), * make y a member of x, * -- for facilities y such that reassigning y and all its members to x * would save cost, realize this closing and reassignment. * * If the cost of this operation is negative (i.e., if this entire operation * saves cost), perform this operation and return the amount of cost saved; * otherwise, do nothing. */ /* numcenters will be updated to reflect the new number of centers */ /* z is the facility cost, x is the number of this point in the array points */ float gain(long x, Points *points, float z, long *numcenters) { int i; int number_of_points = points->num; int number_of_centers_to_close = 0; // for each point y, will y switch membership to x? int *switch_membership = (int *) malloc(sizeof(int) * number_of_points); for ( i = 0; i < number_of_points; i++ ) { switch_membership[i] = 0; } // for each point y, if y is a median, how much will it save by closing // and shifting itself and its members to x? float *lower = (float *) malloc(sizeof(int) * number_of_points); for ( i = 0; i < number_of_points; i++ ) { lower[i] = 0; } // We will calculate the cost of opening a median at x float cost_of_opening_x; if ( points->p[x].assign == x ) { // x is already a median; opening it doesn't cost an additional z cost_of_opening_x = 0; } else { cost_of_opening_x = z; } // every median would save ("lower" its cost by) z if it could close // and reassign itself and its members to x for ( i = 0; i < number_of_points; i++ ) { lower[points->p[i].assign] = z; } for ( i = 0; i < number_of_points; i++ ) { float x_cost = dist(points->p[i], points->p[x], points->dim) * points->p[i].weight; float current_cost = dist(points->p[i], points->p[points->p[i].assign], points->dim) * points->p[i].weight; if ( x_cost < current_cost ) { // point i would save cost just by switching to x // (note that i cannot be a median, // or else dist(p[i], p[x]) would be 0) switch_membership[i] = 1; cost_of_opening_x += x_cost - current_cost; } else { // cost of assigning i to x is at least current assignment cost of i // consider the savings that i's **current** median would realize // if we reassigned that median and all its members to x; // note we've already accounted for the fact that the median // would save z by closing; now we have to subtract from the savings // the extra cost of reassigning that median and its members lower[points->p[i].assign] += current_cost - x_cost; } } // at this time, we can calculate the cost of opening a center // at x; if it is negative, we'll go through with opening it for ( i = 0; i < number_of_points; i++ ) { if ( lower[i] > 0 ) { // i is a median, and // if we were to open x (which we still may not) we'd close i // note, we'll ignore the following quantity unless we do open x ++number_of_centers_to_close; cost_of_opening_x -= lower[i]; } } // now, check whether opening x would save cost; if so, do it, // and otherwise do nothing if ( cost_of_opening_x < 0 ) { // we'd save money by opening x; we'll do it for ( i = 0; i < number_of_points; i++ ) { if ( switch_membership[i] || lower[points->p[i].assign] > 0 ) { // Either i's median (which may be i itself) is closing, // or i is closer to x than to its current median points->p[i].cost = points->p[i].weight * dist(points->p[i], points->p[x], points->dim); points->p[i].assign = x; } } *numcenters = *numcenters + 1 - number_of_centers_to_close; } else { cost_of_opening_x = 0; // the value we'll return } free(switch_membership); free(lower); return -cost_of_opening_x; } /* facility location on the points using local search */ /* z is the facility cost, returns the total cost and # of centers */ /* assumes we are seeded with a reasonable solution */ /* cost should represent this solution's cost */ /* halt if there is < e improvement after iter calls to gain */ /* feasible is an array of numfeasible points which may be centers */ float FL(Points *points, int *feasible, int numfeasible, float z, long *k, double cost, long iter, float e) { long i; long x; double change; long numberOfPoints; #ifdef PRINTINFO fprintf(stderr, "%d centers, cost %lf, total distance %lf\n", *k, cost, cost - z*(*k)); #endif change = -cost; /* continue until we run iter iterations without improvement */ /* stop instead if improvement is less than e */ while (change/cost < -1.0*e) { change = 0.0; numberOfPoints = points->num; /* randomize order in which centers are considered */ intshuffle(feasible, numfeasible); for (i=0;i= numfeasible) x = i%numfeasible; else x = lrand48()%numfeasible; change += gain(feasible[x], points, z, k); } cost += change; #ifdef PRINTINFO fprintf(stderr, "%d centers, cost %lf, total distance %lf\n", *k, cost, cost - z*(*k)); #endif } return(cost); } /* set feasible to be an array of feasible centers */ /* return number of feasible centers */ int selectfeasible(Points *points, int **feasible, int kmin) { int i, ii; int numfeasible; float *feasweight; float totalweight; float mytotal; numfeasible = points->num; if (numfeasible > (ITER*kmin*log(kmin))) numfeasible = (int)(ITER*kmin*log(kmin)); *feasible = (int *)malloc(numfeasible*sizeof(int)); /* not many points, all will be feasible */ if (numfeasible == points->num) { for (i=0;inum;i++) (*feasible)[i] = i; } else { /* choose feasible points at random */ totalweight = 0.0; for (i=0;inum;i++) totalweight+=points->p[i].weight; feasweight = (float *)malloc(numfeasible*sizeof(float)); for (i=0;inum;i++) { mytotal+=points->p[i].weight; while ((ii < numfeasible)&&(mytotal > feasweight[ii])) (*feasible)[ii++] = i; } if (ii != numfeasible) fprintf(stderr, "Error in Feasible Centers\n"); free(feasweight); } return(numfeasible); } /* compute approximate kmedian on the points */ float kmedian(Points *points, long kmin, long kmax) { int i; float cost; float lastcost; float hiz, loz, z; long k; int flag; int *feasible; int numfeasible; hiz = loz = 0.0; long numberOfPoints = points->num; long ptDimension = points->dim; #ifdef PRINTINFO fprintf(stderr, "Starting Kmedian procedure\n"); fprintf(stderr, "%d points in %d dimensions\n", numberOfPoints, ptDimension); #endif for (i=0;ip[i], points->p[0], ptDimension)*points->p[i].weight; loz=0.0; z = (hiz+loz)/2.0; shuffle(points); cost = speedy2(points, 0.001, &k, kmax); if (k <= kmax) { return(cost); } shuffle(points); cost = speedy(points, z, &k); #ifdef PRINTINFO fprintf(stderr, "Finished first call to speedy\n"); #endif i=0; /* NEW: Check whether more centers than points! */ if (points->num <= kmax) { /* just return all points as facilities */ for (i=0;inum;i++) { points->p[i].assign = i; points->p[i].cost = 0; return(0.0); } } /* give speedy SP chances to get at least kmin/2 facilities */ while ((k < kmin)&&(i= SP) {hiz=z; z=(hiz+loz)/2.0; i=0;} shuffle(points); cost = speedy(points, z, &k); i++; } /* now we begin the binary search for real */ /* must designate some points as feasible centers */ /* this creates more consistancy between FL runs */ /* helps to guarantee correct # of centers at the end */ numfeasible = selectfeasible(points, &feasible, kmin); while(1) { #ifdef PRINTINFO fprintf(stderr, "loz = %lf, hiz = %lf\n", loz, hiz); fprintf(stderr, "Running Local Search...\n"); #endif /* first get a rough estimate on the FL solution */ lastcost = cost; cost = FL(points, feasible, numfeasible, z, &k, cost, (long)(ITER*kmax*log(kmax)), 0.1); /* if number of centers seems good, try a more accurate FL */ if (((k <= (1.1)*kmax)&&(k >= (0.9)*kmin))|| ((k <= kmax+2)&&(k >= kmin-2))) { #ifdef PRINTINFO fprintf(stderr, "Trying a more accurate local search...\n"); #endif /* may need to run a little longer here before halting without improvement */ cost = FL(points, feasible, numfeasible, z, &k, cost, (long)(ITER*kmax*log(kmax)), 0.001); } if (k > kmax) { /* facilities too cheap */ /* increase facility cost and up the cost accordingly */ loz = z; z = (hiz+loz)/2.0; cost += (z-loz)*k; } if (k < kmin) { /* facilities too expensive */ /* decrease facility cost and reduce the cost accordingly */ hiz = z; z = (hiz+loz)/2.0; cost += (z-hiz)*k; } /* if k is good, return the result */ /* if we're stuck, just give up and return what we have */ if (((k <= kmax)&&(k >= kmin))||((loz >= (0.999)*hiz))) { free(feasible); return(cost);} } } /* compute the means for the k clusters */ float contcenters(Points *points) { long i, ii; float relweight; float cost; for (i=0;inum;i++) { /* compute relative weight of this point to the cluster */ if (points->p[i].assign != i) { relweight=points->p[points->p[i].assign].weight + points->p[i].weight; relweight = points->p[i].weight/relweight; for (ii=0;iidim;ii++) { points->p[points->p[i].assign].coord[ii]*=1.0-relweight; points->p[points->p[i].assign].coord[ii]+= points->p[i].coord[ii]*relweight; } points->p[points->p[i].assign].weight += points->p[i].weight; } } cost = 0.0; for (i=0;inum;i++) cost += dist(points->p[i], points->p[points->p[i].assign], points->dim)*points->p[i].weight; return(cost); } /* print the centers to a file of a given name */ void outcenters(Points *points, char *outname) { long i, ii; long k; int *is_a_median = (int *) malloc(points->num * sizeof(int)); for ( i = 0; i < points->num; i++) { is_a_median[i] = 0; } /* mark the centers */ for ( i = 0; i < points->num; i++ ) { is_a_median[points->p[i].assign] = 1; } k=0; /* count how many centers */ for ( i = 0; i < points->num; i++ ) { if ( is_a_median[i] ) { k++; } } FILE *fpOut = fopen(outname, "w"); /* output the centers */ for ( i = 0; i < points->num; i++ ) { if ( is_a_median[i] ) { fprintf(fpOut, "%d", (int)points->p[i].weight); for ( ii = 0; ii < points->dim; ii++ ) { fprintf(fpOut, " %lf", points->p[i].coord[ii]); } fprintf(fpOut, "\n"); } } fclose(fpOut); free(is_a_median); } main(int argc, char **argv) { char *outfilename = new char[MAXNAMESIZE]; long kmin, kmax, k, dimension, N, samplesize; Points points, sampl; // Note: 00.12.11: (Liadan) Note that I've changed command line: // It's now a.out k1 k2 N d inputfile samplesize int seed; if (SEED) getSeed(&seed); if ( readcommands(argc, argv, &kmin, &kmax, &points, &sampl, &samplesize, &outfilename) == 0 ) { exit(0); } dimension = points.dim; kmedian(&points, kmin, kmax); contcenters(&points); outcenters(&points, outfilename); }