/*
 * Xref: utzoo news.software.b:5480 alt.sources:2171
 * Path: utzoo!utstat!news-server.csri.toronto.edu!cs.utexas.edu!yale!mintaka!olivea!olivey!jerry
 * From: jerry@olivey.olivetti.com (Jerry Aguirre)
 * Newsgroups: news.software.b,alt.sources
 * Subject: Tool to find duplicate articles
 * Keywords: news expire rebuild duplicates
 * Message-ID: <49290@olivea.atc.olivetti.com>
 * Date: 16 Aug 90 19:06:16 GMT
 * Sender: news@olivea.atc.olivetti.com
 * Followup-To: news.software.b
 * Lines: 129
 *
 * Here is a tool I threw together when my news history got corrupted and
 * users started complaining about seeing duplicates of articles.
 *
 * ===BEGIN histdups.c===
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define LINESIZ 1024	/* size of the line buffers, including the NUL */
#define MAXF 32		/* most file entries remembered per Message-ID */

/* Expects the stdin to be the history file, sorted.  Stdout is a list
 * of file names which are duplicates of earlier articles.  Run after
 * expire -r and then "rm" the files listed in the output.
 *
 * Input lines are split on tabs: the first field is the article id, the
 * second field is skipped, and the third is a space-separated list of
 * "group/number" entries.  Lines without a tab are ignored.
 *
 * Each output line is one such entry with every '.' replaced by '/'
 * (e.g. "comp.lang.c/123" is printed as "comp/lang/c/123").
 *
 * The example command line in the archived copy survives only as
 *	sort dupfiles; xargs
 * NOTE(review): presumably it was something like
 *	sort history | histdups > dupfiles; xargs rm < dupfiles
 * run from the news spool directory -- confirm before relying on it.
 */

static char files[MAXF][LINESIZ];	/* entries collected for the current id */
static int nf;				/* number of valid entries in files[] */

static int read_line(char *buf, size_t size);
static void parsefiles(const char *line);
static void printdups(void);

/*
 * Read the history from stdin and print the duplicate article files.
 *
 * Consecutive lines carrying the same id form one group.  The first line
 * of a group is only remembered (in lastline); its file list is parsed
 * lazily, once a second line with the same id shows that the group really
 * contains duplicates.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if writing to stdout failed.
 */
int
main(void)
{
	char line[LINESIZ];
	char id[LINESIZ];	/* id of the group being collected */
	char lastline[LINESIZ];	/* first line of the group, not yet parsed */
	char *tab;

	nf = 0;
	id[0] = '\0';
	lastline[0] = '\0';

	while (read_line(line, sizeof line)) {
		tab = strchr(line, '\t');
		if (tab == NULL)
			continue;

		*tab = '\0';	/* isolate the id for comparison */
		if (strcmp(line, id) == 0) {
			/* we have a dup */
			if (lastline[0] != '\0') {
				parsefiles(lastline);
				lastline[0] = '\0';
			}
			*tab = '\t';
			parsefiles(line);
		} else {
			/* new id: report the finished group, start a new one */
			printdups();
			strcpy(id, line);
			*tab = '\t';
			strcpy(lastline, line);
			nf = 0;
		}
	}

	/* The last group has no following id to trigger its report. */
	printdups();

	if (fflush(stdout) == EOF || ferror(stdout))
		return EXIT_FAILURE;
	return EXIT_SUCCESS;
}

/*
 * Read one line from stdin into buf (at most size - 1 characters) and
 * strip the trailing newline.  If the line does not fit, the excess is
 * read and discarded so that it is not mistaken for a separate line.
 *
 * Returns 1 if a line was read, 0 on end of file or read error.
 */
static int
read_line(char *buf, size_t size)
{
	size_t len;
	int ch;

	if (fgets(buf, (int)size, stdin) == NULL)
		return 0;

	len = strlen(buf);
	if (len > 0 && buf[len - 1] == '\n') {
		buf[len - 1] = '\0';
	} else {
		/* over-long line, or a final line with no newline */
		while ((ch = getchar()) != EOF && ch != '\n')
			;
	}
	return 1;
}

/*
 * Append every entry in the third tab-separated field of line to files[].
 *
 * Entries are separated by one or more spaces.  Lines with fewer than
 * three fields add nothing.  Once files[] holds MAXF entries, further
 * entries are silently ignored.  The input line is not modified.
 */
static void
parsefiles(const char *line)
{
	const char *field;
	size_t len;

	field = strchr(line, '\t');		/* end of first field */
	if (field == NULL)
		return;
	field = strchr(field + 1, '\t');	/* end of second field */
	if (field == NULL)
		return;
	field++;

	for (;;) {
		while (*field == ' ')
			field++;
		if (*field == '\0' || nf >= MAXF)
			return;

		len = strcspn(field, " ");
		if (len >= LINESIZ)		/* cannot happen for LINESIZ lines */
			len = LINESIZ - 1;
		memcpy(files[nf], field, len);
		files[nf][len] = '\0';
		nf++;

		field += len;
	}
}

/*
 * Print the duplicate entries among files[0..nf-1].
 *
 * Two entries conflict when the text before the first '/' (the group) is
 * identical.  Of two conflicting entries the one with the higher number
 * after the '/' is flagged; equal numbers flag neither.  Entries without
 * a '/' are never compared or flagged.  Each flagged entry is written on
 * its own line with every '.' turned into '/'.
 */
static void
printdups(void)
{
	int flags[MAXF];
	int i1, i2;
	long n1, n2;
	const char *s1, *s2, *p;
	size_t grouplen;

	for (i1 = 0; i1 < nf; i1++)
		flags[i1] = 0;

	for (i1 = 0; i1 < nf; i1++) {
		s1 = strchr(files[i1], '/');
		if (s1 == NULL)
			continue;
		grouplen = (size_t)(s1 - files[i1]);
		n1 = strtol(s1 + 1, NULL, 10);

		for (i2 = i1 + 1; i2 < nf; i2++) {
			s2 = strchr(files[i2], '/');
			if (s2 == NULL)
				continue;
			if ((size_t)(s2 - files[i2]) != grouplen ||
			    memcmp(files[i1], files[i2], grouplen) != 0)
				continue;

			/* same group */
			n2 = strtol(s2 + 1, NULL, 10);
			if (n2 > n1)
				flags[i2] = 1;	/* lowest number stays */
			else if (n2 < n1)
				flags[i1] = 1;
		}
	}

	for (i1 = 0; i1 < nf; i1++) {
		if (flags[i1] != 1)
			continue;
		for (p = files[i1]; *p != '\0'; p++)
			putchar(*p == '.' ? '/' : *p);
		putchar('\n');
	}
}

/* ===END histdups.c=== */