/* * uniqx ver.1.0 * date: 2005/02/03 * * uniqx deals with duplicated lines in a file * * Official uniq is buggy and too poor * * usage: * uniqx [-udcrf] [-k key,key,...] [file] * flags: * -u: print only unique line * -d: print only duplicated line * -c: add count of duplicated line in printing * -r: print last line of duplicated line * -f: field wise comparison (default is column wise comparison) * -k key,key,... * key specified the field to compare. * key is Python like range syntax. * n:m # the meaning is n,n+1,...,m-1 * :m # the meaning is 0:m * n: # the meaning is n to last * : # the meaning is 0 to last * n # the meaning is single field. * n begins with 0 * * supporting uniq syntax makes the code very dirty. * I gave up. * the relation is: * uniq +n * is same as * uniqx -k n: * and * uniq -n * is same as * uniqx -fk n: * * Kenar (Kenji Arisaw) * E-mail: arisawa@aichi-u.ac.jp */ #include #include #include #include #define gline() Brdstr(&in, '\n', 1) #define MFIELD 128 typedef struct Ranges Ranges; struct Ranges{ int n; int m; Ranges *next; }; Ranges *ranges; Biobuf in; Biobuf out; int count=0; int cflag=0; int uflag=0; int dflag=0; int rflag=0; int fflag=0; void pline(char *bp); int C_equal(Ranges *ranges, char *b1, char *b2); int F_equal(Ranges *ranges, char *b1, char *b2); int (*equal)(Ranges *ranges, char *b1, char *b2); void appendRange(Ranges **r, int n, int m) { Ranges *p, *q; q = nil; p = *r; while(p){ q = p; p = p->next; } // then p is nil p = malloc(sizeof(Ranges)); if(q) q ->next = p; else *r = p; p->n = n; p->m = m; p->next = nil; } void usage(void) { fprint(2,"usage:\n\ uniq [-udcrf] [-k key,key,...] [file]\n\ -udc flag is same as uniq.\n\ -r flag is `print last line of duplicated line'\n\ -f flag means field wise comparison.\n\ key follows Python syntax of range;\n\ that is, one of `n:m' `:m' `n:' `:'\n\ n begins with 0\n\ defaut is: uniq -k 0:\n"); exits("usage"); } void main(int argc, char *argv[]) { int fd,n,m; char *file; char *keys=nil; char *s,c; char *range[2]; int nrange; char *b1; char *b2; ARGBEGIN{ case 'c': cflag = 1; break; case 'd': dflag = 1; break; case 'f': fflag = 1; break; case 'k': keys=ARGF(); if(keys == nil) usage(); break; case 'r': rflag = 1; break; case 'u': uflag = 1; break; default: usage(); }ARGEND if(uflag && dflag) sysfatal("# incompatible flag -u and -d"); if(keys == nil) keys = "0:"; equal = C_equal; if(fflag) equal = F_equal; for(;;){ for(s = keys; *s && *s != ','; s++); c = *s; if(*s == ',') *s = 0; nrange = getfields(keys,range,2,0,":"); n = 0; m = 0; if(nrange == 1){ n = atoi(range[0]); m = n + 1; } if(nrange == 2){ n = atoi(range[0]); m = atoi(range[1]); } if(n < 0) sysfatal("# field out of range"); appendRange(&ranges, n,m); if(c == 0) break; keys = ++s; } file = *argv++; if(file && *argv != nil) usage(); fd = 0; if(file){ fd = open(file, OREAD); if(fd < 0) sysfatal("# cannot open %s: %r\n", *argv); } Binit(&in, fd, OREAD); Binit(&out, 1, OWRITE); if((b1 = gline()) == nil) exits(0); b1 = strdup(b1); count = 1; for(;;){ if((b2 = gline()) == nil) { pline(b1); exits(0); } if(!equal(ranges, b1, b2)){ pline(b1); free(b1); b1 = strdup(b2); count = 1; continue; } if(rflag){ free(b1); b1 = strdup(b2); } count++; } } void pline(char *bp) { if(uflag && count != 1) return; if(dflag && count == 1) return; if(cflag) Bprint(&out, "%4d ", count); Bprint(&out, "%s\n", bp); } int F_equal(Ranges *ranges, char *b1, char *b2) { char *tb1[MFIELD], *tb2[MFIELD]; int n1,n2,i,n,m,min,max; char *bb1; char *bb2; int status; bb1 = strdup(b1); bb2 = strdup(b2); n1 = tokenize(bb1, tb1, MFIELD); n2 = tokenize(bb2, tb2, MFIELD); min = (n1n; m = ranges->m; if(m == 0 || m > max) m = max; if(n < min){ for(i=n; i < m; i++){ if(strcmp(tb1[i], tb2[i]) != 0){ status = 0; goto L1; } } } else if(n < max){ status = 0; goto L1; } ranges = ranges->next; } status = 1; L1: free(bb1); free(bb2); return status; } int C_equal(Ranges *ranges, char *b1, char *b2) { int n1,n2,min,max, n, m; n1 = strlen(b1); n2 = strlen(b2); min = (n1n; m = ranges->m; if(m == 0 || m > max) m = max; if(n < min){ if(strncmp(b1+n,b2+n,m-n) != 0) return 0; } else if(n < max) return 0; ranges = ranges->next; } return 1; }