VirtualBox

source: kBuild/trunk/src/kDeDup/kDeDup.c@ 3364

Last change on this file since 3364 was 3296, checked in by bird, 6 years ago

kDeDup: Made it work on linux (a bit ugly).

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 41.4 KB
Line 
1/* $Id: kDeDup.c 3296 2019-01-22 21:29:08Z bird $ */
2/** @file
3 * kDeDup - Utility that finds duplicate files, optionally hardlinking them.
4 */
5
6/*
7 * Copyright (c) 2016 knut st. osmundsen <[email protected]>
8 *
9 * This file is part of kBuild.
10 *
11 * kBuild is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * kBuild is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with kBuild; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 *
25 */
26
27/*******************************************************************************
28* Header Files *
29*******************************************************************************/
30#include <k/kTypes.h>
31#include <errno.h>
32#include <string.h>
33#include <stdio.h>
34#include <wchar.h>
35#if K_OS != K_OS_WINDOWS
36# include <stdlib.h>
37# include <unistd.h>
38# include <sys/fcntl.h>
39# include <sys/stat.h>
40#endif
41
42#include "md5.h"
43//#include "sha2.h"
44
45#if K_OS == K_OS_WINDOWS
46# include "nt/ntstuff.h"
47# include "nt/ntstat.h"
48# include "nt/fts-nt.h"
49# include "nt/nthlp.h"
50# include "nt/ntunlink.h"
51#else
52# include "fts.h"
53#endif
54
55
56/*********************************************************************************************************************************
57* Structures and Typedefs *
58*********************************************************************************************************************************/
59/**
60 * The key is made up of two cryptographic hashes, collisions are
61 * highly unlikely (once SHA2 is implemented).
62 */
63typedef struct KDUPFILENODEKEY
64{
65 /** The MD5 digest of the file. */
66 KU8 abMd5[16];
67 /** The 256-bit SHA-2 digest of the file. */
68 KU8 abSha2[32];
69} KDUPFILENODEKEY;
70/** Pointer to a file node.*/
71typedef struct KDUPFILENODE *PKDUPFILENODE;
72/**
73 * Hash tree node.
74 */
75typedef struct KDUPFILENODE
76{
77 /** The is made up of two hashes. */
78 KDUPFILENODEKEY mKey;
79 /** Left branch. */
80 PKDUPFILENODE mpLeft;
81 /** Right branch. */
82 PKDUPFILENODE mpRight;
83 /** Tree height (hmm). */
84 KU8 mHeight;
85
86 /** The inode number. */
87 KU64 uInode;
88 /** The device number. */
89 KU64 uDev;
90
91 /** Pointer to next hard linked node (same inode and udev values). */
92 PKDUPFILENODE pNextHardLink;
93 /** Pointer to next duplicate node. */
94 PKDUPFILENODE pNextDup;
95 /** Pointer to next duplicate node on the global list. */
96 PKDUPFILENODE pNextGlobalDup;
97
98 /** The path to this file (variable size). */
99#if K_OS == K_OS_WINDOWS
100 wchar_t wszPath[1];
101#else
102 char szPath[1];
103#endif
104} KDUPFILENODE;
105
106#if K_OS == K_OS_WINDOWS
107# define PATH_PRI "ls"
108# define PATH_MEMB wszPath
109# define FTS_ACCPATH fts_wcsaccpath
110#else
111# define PATH_PRI "s"
112# define PATH_MEMB szPath
113# define FTS_ACCPATH fts_accpath
114#endif
115
116/*#define KAVL_EQUAL_ALLOWED*/
117#define KAVL_CHECK_FOR_EQUAL_INSERT
118#define KAVL_MAX_STACK 32
119/*#define KAVL_RANGE */
120/*#define KAVL_OFFSET */
121/*#define KAVL_STD_KEY_COMP*/
122#define KAVLKEY KDUPFILENODEKEY
123#define KAVLNODE KDUPFILENODE
124#define KAVL_FN(name) kDupFileTree_ ## name
125#define KAVL_TYPE(prefix,name) prefix ## KDUPFILENODE ## name
126#define KAVL_INT(name) KDUPFILENODEINT ## name
127#define KAVL_DECL(rettype) static rettype
128#define KAVL_G(key1, key2) ( memcmp(&(key1), &(key2), sizeof(KDUPFILENODEKEY)) > 0 )
129#define KAVL_E(key1, key2) ( memcmp(&(key1), &(key2), sizeof(KDUPFILENODEKEY)) == 0 )
130#define KAVL_NE(key1, key2) ( memcmp(&(key1), &(key2), sizeof(KDUPFILENODEKEY)) != 0 )
131
132#define register
133#include <k/kAvlTmpl/kAvlBase.h>
134//#include <k/kAvlTmpl/kAvlDoWithAll.h>
135//#include <k/kAvlTmpl/kAvlEnum.h> - busted
136#include <k/kAvlTmpl/kAvlGet.h>
137//#include <k/kAvlTmpl/kAvlGetBestFit.h>
138//#include <k/kAvlTmpl/kAvlGetWithParent.h>
139//#include <k/kAvlTmpl/kAvlRemove2.h>
140//#include <k/kAvlTmpl/kAvlRemoveBestFit.h>
141#include <k/kAvlTmpl/kAvlUndef.h>
142#undef register
143
144
145/** Pointer to a size tree node. */
146typedef struct KDUPSIZENODE *PKDUPSIZENODE;
147/**
148 * Size tree node.
149 */
150typedef struct KDUPSIZENODE
151{
152 /** The file size. */
153 KU64 mKey;
154 /** Left branch. */
155 PKDUPSIZENODE mpLeft;
156 /** Right branch. */
157 PKDUPSIZENODE mpRight;
158 /** Tree height (hmm). */
159 KU8 mHeight;
160 /** Number of files. */
161 KU32 cFiles;
162 /** Tree with same sized files.
163 * When cFiles is 1 the root node does not have hashes calculated yet. */
164 KDUPFILENODEROOT FileRoot;
165} KDUPSIZENODE;
166
167/*#define KAVL_EQUAL_ALLOWED*/
168#define KAVL_CHECK_FOR_EQUAL_INSERT
169#define KAVL_MAX_STACK 32
170/*#define KAVL_RANGE */
171/*#define KAVL_OFFSET */
172#define KAVL_STD_KEY_COMP
173#define KAVLKEY KU64
174#define KAVLNODE KDUPSIZENODE
175#define KAVL_FN(name) kDupSizeTree_ ## name
176#define KAVL_TYPE(prefix,name) prefix ## KDUPSIZENODE ## name
177#define KAVL_INT(name) KDUPSIZENODEINT ## name
178#define KAVL_DECL(rettype) static rettype
179
180#include <k/kAvlTmpl/kAvlBase.h>
181//#include <k/kAvlTmpl/kAvlDoWithAll.h>
182//#include <k/kAvlTmpl/kAvlEnum.h> - busted
183#include <k/kAvlTmpl/kAvlGet.h>
184//#include <k/kAvlTmpl/kAvlGetBestFit.h>
185//#include <k/kAvlTmpl/kAvlGetWithParent.h>
186//#include <k/kAvlTmpl/kAvlRemove2.h>
187//#include <k/kAvlTmpl/kAvlRemoveBestFit.h>
188#include <k/kAvlTmpl/kAvlUndef.h>
189
190
191/*********************************************************************************************************************************
192* Global Variables *
193*********************************************************************************************************************************/
194/** The verbosity level. */
195static unsigned g_cVerbosity = 0;
196
197/** Whether to recurse into subdirectories. */
198static KBOOL g_fRecursive = K_FALSE;
199/** Whether to recurse into symlinked subdirectories. */
200static KBOOL g_fRecursiveViaSymlinks = K_FALSE;
201/** Whether to follow symbolicly linked files. */
202static KBOOL g_fFollowSymlinkedFiles = K_TRUE;
203
204/** Minimum file size to care about. */
205static KU64 g_cbMinFileSize = 1;
206/** Maximum file size to care about. */
207static KU64 g_cbMaxFileSize = KU64_MAX;
208
209/** The root of the size tree. */
210static KDUPSIZENODEROOT g_SizeRoot;
211
212/** Global list of duplicate file with duplicates.
213 * @remarks This only contains the files in the hash tree, not the ones on
214 * the KDUPFILENODE::pNextDup list. */
215static PKDUPFILENODE g_pDuplicateHead = NULL;
216/** Where to insert the next file with duplicates. */
217static PKDUPFILENODE *g_ppNextDuplicate = &g_pDuplicateHead;
218
219/** Number of files we're tracking. */
220static KU64 g_cFiles = 0;
221/** Number of hardlinked files or files entered more than once. */
222static KU64 g_cHardlinked = 0;
223/** Number of duplicates files (not hardlinked). */
224static KU64 g_cDuplicates = 0;
225/** Number of duplicates files that can be hardlinked. */
226static KU64 g_cDuplicatesSaved = 0;
227/** Size that could be saved if the duplicates were hardlinked. */
228static KU64 g_cbDuplicatesSaved = 0;
229
230
231
232/**
233 * Wrapper around malloc() that complains when out of memory.
234 *
235 * @returns Pointer to allocated memory
236 * @param cb The size of the memory to allocate.
237 */
238static void *kDupAlloc(KSIZE cb)
239{
240 void *pvRet = malloc(cb);
241 if (pvRet)
242 return pvRet;
243 fprintf(stderr, "kDeDup: error: out of memory! (cb=%#zx)\n", cb);
244 return NULL;
245}
246
247/** Wrapper around free() for symmetry. */
248#define kDupFree(ptr) free(ptr)
249
250#if K_OS != K_OS_WINDOWS
251/** Wrapper around read() that hides EINTR and such. */
252static ssize_t kDupReadFile(int fd, void *pvBuf, size_t cbToRead)
253{
254 ssize_t cbRet;
255 do
256 cbRet = read(fd, pvBuf, cbToRead);
257 while (cbRet < 0 && errno == EINTR);
258 if (cbRet > 0 && (size_t)cbRet != cbToRead)
259 {
260 for (;;)
261 {
262 size_t cbLeft = cbToRead - (size_t)cbRet;
263 ssize_t cbPart;
264 do
265 cbPart = read(fd, (KU8 *)pvBuf + (size_t)cbRet, cbLeft);
266 while (cbPart < 0 && errno == EINTR);
267 if (cbPart <= 0)
268 break;
269 cbRet += cbPart;
270 }
271 }
272 return cbRet;
273}
274#endif
275
276
277static void kDupHashFile(PKDUPFILENODE pFileNode, FTSENT *pFtsEnt)
278{
279 KSIZE i;
280 PKDUPFILENODE *ppHash;
281
282 /*
283 * Open the file.
284 */
285#if K_OS == K_OS_WINDOWS
286 HANDLE hFile;
287 if (pFtsEnt && pFtsEnt->fts_parent && pFtsEnt->fts_parent->fts_dirfd != INVALID_HANDLE_VALUE)
288 hFile = birdOpenFileExW(pFtsEnt->fts_parent->fts_dirfd, pFtsEnt->fts_wcsname,
289 FILE_READ_DATA | SYNCHRONIZE,
290 FILE_ATTRIBUTE_NORMAL,
291 FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
292 FILE_OPEN,
293 FILE_NON_DIRECTORY_FILE | FILE_OPEN_FOR_BACKUP_INTENT | FILE_SYNCHRONOUS_IO_NONALERT,
294 OBJ_CASE_INSENSITIVE);
295 else
296 hFile = birdOpenFileExW(NULL, pFileNode->wszPath,
297 FILE_READ_DATA | SYNCHRONIZE,
298 FILE_ATTRIBUTE_NORMAL,
299 FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
300 FILE_OPEN,
301 FILE_NON_DIRECTORY_FILE | FILE_OPEN_FOR_BACKUP_INTENT | FILE_SYNCHRONOUS_IO_NONALERT,
302 OBJ_CASE_INSENSITIVE);
303 if (hFile != INVALID_HANDLE_VALUE)
304#else /* K_OS != K_OS_WINDOWS */
305# ifdef O_BINARY
306 int fd = open(pFileNode->szPath, O_RDONLY | O_BINARY);
307# else
308 int fd = open(pFileNode->szPath, O_RDONLY);
309# endif
310 if (fd >= 0)
311#endif /* K_OS != K_OS_WINDOWS */
312 {
313 /*
314 * Init the hash calculation contexts.
315 */
316 struct MD5Context Md5Ctx;
317 //SHA256CONTEXT Sha256Ctx;
318 MD5Init(&Md5Ctx);
319 //Sha256Init(&Sha256Ctx);
320
321 /*
322 * Process the file chunk by chunk.
323 *
324 * We could complicate this by memory mapping medium sized files, but
325 * those kind of complications can wait.
326 */
327 for (;;)
328 {
329 static KU8 s_abBuffer[2*1024*1024];
330#if K_OS == K_OS_WINDOWS
331 MY_NTSTATUS rcNt;
332 MY_IO_STATUS_BLOCK Ios;
333 Ios.Information = -1;
334 Ios.u.Status = -1;
335 rcNt = g_pfnNtReadFile(hFile, NULL /*hEvent*/, NULL /*pfnApc*/, NULL /*pvApcCtx*/,
336 &Ios, s_abBuffer, sizeof(s_abBuffer), NULL /*poffFile*/, NULL /*puKey*/);
337 if (MY_NT_SUCCESS(rcNt))
338 {
339 MD5Update(&Md5Ctx, s_abBuffer, (unsigned)Ios.Information);
340 //SHA256Update(&Sha256Ctx, s_abBuffer, Ios.Information);
341 }
342 else if (rcNt != STATUS_END_OF_FILE)
343 {
344 fprintf(stderr, "kDeDup: warning: Error reading '%ls': %#x\n", pFileNode->wszPath, rcNt);
345 break;
346 }
347
348 /* Check for end of file. */
349 if ( rcNt == STATUS_END_OF_FILE
350 || Ios.Information < sizeof(s_abBuffer))
351 {
352 MD5Final(pFileNode->mKey.abMd5, &Md5Ctx);
353 //Sha256Final(pFileNode->mKey.abSha2, &Sha256Ctx);
354
355 birdCloseFile(hFile);
356 return;
357 }
358#else /* K_OS != K_OS_WINDOWS */
359 ssize_t cbRead = kDupReadFile(fd, s_abBuffer, sizeof(s_abBuffer));
360 if (cbRead > 0)
361 {
362 MD5Update(&Md5Ctx, s_abBuffer, (unsigned)cbRead);
363 //SHA256Update(&Sha256Ctx, s_abBuffer, (unsigned)cbRead);
364 }
365 else if (cbRead == 0)
366 {
367 MD5Final(pFileNode->mKey.abMd5, &Md5Ctx);
368 //Sha256Final(pFileNode->mKey.abSha2, &Sha256Ctx);
369 close(fd);
370 return;
371 }
372 else
373 {
374 fprintf(stderr, "kDeDup: warning: Error reading '%s': %s (%d)\n", pFileNode->szPath, strerror(errno), errno);
375 break;
376 }
377#endif /* K_OS != K_OS_WINDOWS */
378 }
379
380#if K_OS == K_OS_WINDOWS
381 birdCloseFile(hFile);
382#else
383 close(fd);
384#endif
385 }
386 else
387 fprintf(stderr, "kDeDup: warning: Failed to open '%" PATH_PRI "': %s (%d)\n",
388 pFileNode->PATH_MEMB, strerror(errno), errno);
389
390 /*
391 * Hashing failed. We fake the digests by repeating the node pointer value
392 * again and again, holding a collision with both SHA2 and MD5 with similar
393 * digest pattern for highly unlikely.
394 */
395 ppHash = (PKDUPFILENODE *)&pFileNode->mKey;
396 i = sizeof(pFileNode->mKey) / sizeof(*ppHash);
397 while (i-- > 0)
398 *ppHash++ = pFileNode;
399}
400
401
402/**
403 * Deal with one file, adding it to the tree if it matches the criteria.
404 *
405 * @returns 0 on success, non-zero on failure.
406 * @param pFtsEnt The FTS entry for the file.
407 */
408static int kDupDoFile(FTSENT *pFtsEnt)
409{
410 KU64 cbFile;
411#if K_OS == K_OS_WINDOWS
412 struct stat const *pStat = &pFtsEnt->fts_stat;
413#else
414 struct stat const *pStat = pFtsEnt->fts_statp;
415#endif
416
417 if (g_cVerbosity >= 2)
418 printf("debug: kDupDoFile(%" PATH_PRI ")\n", pFtsEnt->FTS_ACCPATH);
419
420 /*
421 * Check that it's within the size range.
422 */
423 cbFile = pStat->st_size;
424 if ( cbFile >= g_cbMinFileSize
425 && cbFile <= g_cbMaxFileSize)
426 {
427 /*
428 * Start out treating this like a unique file with a unique size, i.e.
429 * allocate all the structures we might possibly need.
430 */
431#if K_OS == K_OS_WINDOWS
432 size_t cbAccessPath = (wcslen(pFtsEnt->fts_wcsaccpath) + 1) * sizeof(wchar_t);
433#else
434 size_t cbAccessPath = strlen(pFtsEnt->fts_accpath) + 1;
435#endif
436 PKDUPFILENODE pFileNode = (PKDUPFILENODE)kDupAlloc(sizeof(*pFileNode) + cbAccessPath);
437 PKDUPSIZENODE pSizeNode = (PKDUPSIZENODE)kDupAlloc(sizeof(*pSizeNode));
438 if (!pFileNode || !pSizeNode)
439 return 3;
440 g_cFiles++;
441
442 memset(&pFileNode->mKey, 0, sizeof(pFileNode->mKey));
443 pFileNode->pNextHardLink = NULL;
444 pFileNode->pNextDup = NULL;
445 pFileNode->pNextGlobalDup = NULL;
446 pFileNode->uDev = pStat->st_dev;
447 pFileNode->uInode = pStat->st_ino;
448 memcpy(pFileNode->PATH_MEMB, pFtsEnt->FTS_ACCPATH, cbAccessPath);
449
450 pSizeNode->mKey = cbFile;
451 pSizeNode->cFiles = 1;
452 kDupFileTree_Init(&pSizeNode->FileRoot);
453 kDupFileTree_Insert(&pSizeNode->FileRoot, pFileNode);
454
455 /*
456 * Try insert it.
457 */
458 if (kDupSizeTree_Insert(&g_SizeRoot, pSizeNode))
459 { /* unique size, nothing more to do for now. */ }
460 else
461 {
462 /*
463 * More than one file with this size. We may need to hash the
464 * hash the file we encountered with this size, if this is the
465 * second one. In that case we should check for hardlinked or
466 * double entering of the file first as well.
467 */
468 kDupFree(pSizeNode);
469 pSizeNode = kDupSizeTree_Get(&g_SizeRoot, cbFile);
470 if (pSizeNode->cFiles == 1)
471 {
472 PKDUPFILENODE pFirstFileNode = pSizeNode->FileRoot.mpRoot;
473 if ( pFirstFileNode->uInode == pFileNode->uInode
474 && pFileNode->uInode != 0
475 && pFirstFileNode->uDev == pFileNode->uDev)
476 {
477 pFileNode->pNextHardLink = pFirstFileNode->pNextHardLink;
478 pFirstFileNode->pNextHardLink = pFileNode;
479 if (g_cVerbosity >= 1)
480 printf("Found hardlinked: '%" PATH_PRI "' -> '%" PATH_PRI "' (ino:%#" KX64_PRI " dev:%#" KX64_PRI ")\n",
481 pFileNode->PATH_MEMB, pFirstFileNode->PATH_MEMB, pFileNode->uInode, pFileNode->uDev);
482 g_cHardlinked += 1;
483 return 0;
484 }
485
486 kDupHashFile(pFirstFileNode, NULL);
487 }
488 kDupHashFile(pFileNode, pFtsEnt);
489
490 if (kDupFileTree_Insert(&pSizeNode->FileRoot, pFileNode))
491 { /* great, unique content */ }
492 else
493 {
494 /*
495 * Duplicate content. Could be hardlinked or a duplicate entry.
496 */
497 PKDUPFILENODE pDupFileNode = kDupFileTree_Get(&pSizeNode->FileRoot, pFileNode->mKey);
498 if ( pDupFileNode->uInode == pFileNode->uInode
499 && pFileNode->uInode != 0
500 && pDupFileNode->uDev == pFileNode->uDev)
501 {
502 pFileNode->pNextHardLink = pDupFileNode->pNextHardLink;
503 pDupFileNode->pNextHardLink = pFileNode;
504 if (g_cVerbosity >= 1)
505 printf("Found hardlinked: '%" PATH_PRI "' -> '%" PATH_PRI "' (ino:%#" KX64_PRI " dev:%#" KX64_PRI ")\n",
506 pFileNode->PATH_MEMB, pDupFileNode->PATH_MEMB, pFileNode->uInode, pFileNode->uDev);
507 g_cHardlinked += 1;
508 }
509 else
510 {
511 KBOOL fDifferentDev;
512
513 /* Genuinly duplicate (or inode numbers are busted). */
514 if (!pDupFileNode->pNextDup)
515 {
516 *g_ppNextDuplicate = pDupFileNode;
517 g_ppNextDuplicate = &pDupFileNode->pNextGlobalDup;
518 }
519
520 /* The list is sorted by device to better facility hardlinking later. */
521 while ( (fDifferentDev = pDupFileNode->uDev != pFileNode->uDev)
522 && pDupFileNode->pNextDup)
523 pDupFileNode = pDupFileNode->pNextDup;
524
525 pFileNode->pNextDup = pDupFileNode->pNextDup;
526 pDupFileNode->pNextDup = pFileNode;
527
528 g_cDuplicates += 1;
529 if (!fDifferentDev)
530 {
531 g_cDuplicatesSaved += 1;
532#if K_OS == K_OS_WINDOWS
533 g_cbDuplicatesSaved += pStat->st_blocks * BIRD_STAT_BLOCK_SIZE;
534#else
535 g_cbDuplicatesSaved += pStat->st_size;
536#endif
537 if (g_cVerbosity >= 1)
538 printf("Found duplicate: '%" PATH_PRI "' <-> '%" PATH_PRI "'\n",
539 pFileNode->PATH_MEMB, pDupFileNode->PATH_MEMB);
540 }
541 else if (g_cVerbosity >= 1)
542 printf("Found duplicate: '%" PATH_PRI "' <-> '%" PATH_PRI "' (devices differ).\n",
543 pFileNode->PATH_MEMB, pDupFileNode->PATH_MEMB);
544 }
545 }
546 }
547 }
548 else if (g_cVerbosity >= 1)
549 printf("Skipping '%" PATH_PRI "' because %" KU64_PRI " bytes is outside the size range.\n", pFtsEnt->FTS_ACCPATH, cbFile);
550 return 0;
551}
552
553
554/**
555 * Process the non-option arguments, creating the file tree.
556 *
557 * @returns 0 on success, non-zero on failure.
558 * @param papwszFtsArgs The input in argv style.
559 * @param fFtsOptions The FTS options.
560 */
561#if K_OS == K_OS_WINDOWS
562static int kDupReadAll(wchar_t **papwszFtsArgs, unsigned fFtsOptions)
563#else
564static int kDupReadAll(char **papszFtsArgs, unsigned fFtsOptions)
565#endif
566{
567 int rcExit = 0;
568#if K_OS == K_OS_WINDOWS
569 FTS *pFts = nt_fts_openw(papwszFtsArgs, fFtsOptions, NULL /*pfnCompare*/);
570#else
571 FTS *pFts = fts_open(papszFtsArgs, fFtsOptions, NULL /*pfnCompare*/);
572#endif
573 if (pFts != NULL)
574 {
575 for (;;)
576 {
577#if K_OS == K_OS_WINDOWS
578 FTSENT *pFtsEnt = nt_fts_read(pFts);
579#else
580 FTSENT *pFtsEnt = fts_read(pFts);
581#endif
582 if (pFtsEnt)
583 {
584 switch (pFtsEnt->fts_info)
585 {
586 case FTS_F:
587 rcExit = kDupDoFile(pFtsEnt);
588 if (rcExit == 0)
589 continue;
590 break;
591
592 case FTS_D:
593 if ( g_fRecursive
594 || pFtsEnt->fts_level == FTS_ROOTLEVEL) /* enumerate dirs on the command line */
595 continue;
596#if K_OS == K_OS_WINDOWS
597 rcExit = nt_fts_set(pFts, pFtsEnt, FTS_SKIP);
598#else
599 rcExit = fts_set(pFts, pFtsEnt, FTS_SKIP);
600#endif
601 if (rcExit == 0)
602 continue;
603 fprintf(stderr, "kDeDup: internal error: nt_fts_set failed!\n");
604 rcExit = 1;
605 break;
606
607 case FTS_DP:
608 /* nothing to do here. */
609 break;
610
611 case FTS_SL:
612 {
613#if K_OS == K_OS_WINDOWS
614 /* The nice thing on windows is that we already know whether it's a
615 directory or file when encountering the symbolic link. */
616 if ( (pFtsEnt->fts_stat.st_isdirsymlink ? g_fRecursiveViaSymlinks : g_fFollowSymlinkedFiles)
617 && pFtsEnt->fts_number == 0)
618#else
619 struct stat St;
620 if ( pFtsEnt->fts_number == 0
621 && ( (g_fRecursiveViaSymlinks && g_fFollowSymlinkedFiles)
622 || ( stat(pFtsEnt->fts_accpath, &St) == 0
623 && (S_ISDIR(St.st_mode) ? g_fRecursiveViaSymlinks : g_fFollowSymlinkedFiles))))
624#endif
625 {
626 pFtsEnt->fts_number++;
627#if K_OS == K_OS_WINDOWS
628 rcExit = nt_fts_set(pFts, pFtsEnt, FTS_FOLLOW);
629#else
630 rcExit = fts_set(pFts, pFtsEnt, FTS_FOLLOW);
631#endif
632 if (rcExit == 0)
633 continue;
634 fprintf(stderr, "kDeDup: internal error: nt_fts_set failed!\n");
635 rcExit = 1;
636 }
637 break;
638 }
639
640 case FTS_DC:
641 fprintf(stderr, "kDeDup: warning: Ignoring cycle '%" PATH_PRI "'!\n", pFtsEnt->FTS_ACCPATH);
642 continue;
643
644 case FTS_NS:
645 fprintf(stderr, "kDeDup: warning: Failed to stat '%" PATH_PRI "': %s (%d)\n",
646 pFtsEnt->FTS_ACCPATH, strerror(pFtsEnt->fts_errno), pFtsEnt->fts_errno);
647 continue;
648
649 case FTS_DNR:
650 fprintf(stderr, "kDeDup: error: Error reading directory '%" PATH_PRI "': %s (%d)\n",
651 pFtsEnt->FTS_ACCPATH, strerror(pFtsEnt->fts_errno), pFtsEnt->fts_errno);
652 rcExit = 1;
653 break;
654
655 case FTS_ERR:
656 fprintf(stderr, "kDeDup: error: Error on '%" PATH_PRI "': %s (%d)\n",
657 pFtsEnt->FTS_ACCPATH, strerror(pFtsEnt->fts_errno), pFtsEnt->fts_errno);
658 rcExit = 1;
659 break;
660
661
662 /* ignore */
663 case FTS_SLNONE:
664 case FTS_DEFAULT:
665 break;
666
667 /* Not supposed to get here. */
668 default:
669 fprintf(stderr, "kDeDup: internal error: fts_info=%d - '%" PATH_PRI "'\n",
670 pFtsEnt->fts_info, pFtsEnt->FTS_ACCPATH);
671 rcExit = 1;
672 break;
673 }
674 }
675 else if (errno == 0)
676 break;
677 else
678 {
679 fprintf(stderr, "kDeDup: error: nt_fts_read failed: %s (%d)\n", strerror(errno), errno);
680 rcExit = 1;
681 break;
682 }
683 }
684
685#if K_OS == K_OS_WINDOWS
686 if (nt_fts_close(pFts) != 0)
687#else
688 if (fts_close(pFts) != 0)
689#endif
690 {
691 fprintf(stderr, "kDeDup: error: nt_fts_close failed: %s (%d)\n", strerror(errno), errno);
692 rcExit = 1;
693 }
694 }
695 else
696 {
697 fprintf(stderr, "kDeDup: error: nt_fts_openw failed: %s (%d)\n", strerror(errno), errno);
698 rcExit = 1;
699 }
700
701 return rcExit;
702}
703
704
705/**
706 * Compares the content of the two files.
707 *
708 * @returns 0 if equal, 1 if not equal, -1 on open/read error.
709 * @param pFile1 The first file.
710 * @param pFile2 The second file.
711 */
712static int kDupCompareFiles(PKDUPFILENODE pFile1, PKDUPFILENODE pFile2)
713{
714#if K_OS == K_OS_WINDOWS
715 int rcRet = 0;
716 K_NOREF(pFile1);
717 K_NOREF(pFile2);
718 /** @todo compare files. */
719#else
720 int rcRet = -1;
721# ifdef O_BINARY
722 int fOpen = O_RDONLY | O_BINARY;
723# else
724 int fOpen = O_RDONLY;
725# endif
726 /*
727 * Open the two files.
728 */
729 int fd1 = open(pFile1->szPath, fOpen);
730 if (fd1 >= 0)
731 {
732 int fd2 = open(pFile2->szPath, fOpen);
733 if (fd1 >= 0)
734 {
735 /*
736 * Read and compare all the data.
737 */
738 static KU8 s_abBuf1[2*1024*1024];
739 static KU8 s_abBuf2[2*1024*1024];
740 KU64 off = 0;
741 for (;;)
742 {
743 ssize_t cb1 = kDupReadFile(fd1, s_abBuf1, sizeof(s_abBuf1));
744 ssize_t cb2 = kDupReadFile(fd2, s_abBuf2, sizeof(s_abBuf2));
745 if (cb1 < 0 || cb2 < 0)
746 {
747 if (cb1 < 0)
748 fprintf(stderr, "kDeDup: error: reading from '%s': %s (%d)\n", pFile1->szPath, strerror(errno), errno);
749 if (cb2 < 0)
750 fprintf(stderr, "kDeDup: error: reading from '%s': %s (%d)\n", pFile2->szPath, strerror(errno), errno);
751 break;
752 }
753 if (cb1 != cb2)
754 {
755 fprintf(stderr, "kDeDup: warning: '%s' now differs from '%s' in size...\n", pFile1->szPath, pFile2->szPath);
756 rcRet = 1;
757 break;
758 }
759 if (cb1 == 0)
760 {
761 rcRet = 0;
762 break;
763 }
764 if (memcmp(s_abBuf1, s_abBuf2, cb1) != 0)
765 {
766 fprintf(stderr, "kDeDup: warning: hash collision: '%s' differs from '%s' (" KX64_PRI " LB %#x)\n",
767 pFile1->szPath, pFile2->szPath, off, (unsigned)cb1);
768 rcRet = 1;
769 break;
770 }
771 off += cb1;
772 }
773
774 close(fd2);
775 }
776 close(fd1);
777 }
778#endif
779 return rcRet;
780}
781
782
783/**
784 * Hardlink duplicates.
785 */
786static int kDupHardlinkDuplicates(void)
787{
788 int rcExit = 0;
789 PKDUPFILENODE pFileNode;
790 for (pFileNode = g_pDuplicateHead; pFileNode != NULL; pFileNode = pFileNode->pNextGlobalDup)
791 {
792 PKDUPFILENODE pTargetFile = pFileNode;
793 PKDUPFILENODE pDupFile;
794 for (pDupFile = pFileNode->pNextDup; pDupFile != NULL; pDupFile = pDupFile->pNextDup)
795 {
796 /*
797 * Can only hard link if the files are on the same device.
798 */
799 if (pDupFile->uDev == pTargetFile->uDev)
800 {
801 if (kDupCompareFiles(pDupFile, pTargetFile) == 0)
802 {
803 /*
804 * Start by renaming the orinal file before we try create the hard link.
805 */
806#if K_OS == K_OS_WINDOWS
807 static const wchar_t s_wszBackupSuffix[] = L".kDepBackup";
808 wchar_t wszBackup[0x4000];
809 size_t cwcPath = wcslen(pDupFile->wszPath);
810 if (cwcPath + sizeof(s_wszBackupSuffix) / sizeof(wchar_t) < K_ELEMENTS(wszBackup))
811 {
812 memcpy(wszBackup, pDupFile->wszPath, cwcPath * sizeof(wchar_t));
813 memcpy(&wszBackup[cwcPath], s_wszBackupSuffix, sizeof(s_wszBackupSuffix));
814 if (MoveFileW(pDupFile->wszPath, wszBackup))
815 {
816 if (CreateHardLinkW(pDupFile->wszPath, pTargetFile->wszPath, NULL))
817 {
818 if (birdUnlinkForcedW(wszBackup) == 0)
819 {
820 if (g_cVerbosity >= 1)
821 printf("Hardlinked '%ls' to '%ls'.\n", pDupFile->wszPath, pTargetFile->wszPath);
822 }
823 else
824 {
825 fprintf(stderr, "kDeDup: fatal: failed to delete '%ls' after hardlinking: %s (%d)\n",
826 wszBackup, strerror(errno), errno);
827 return 8;
828 }
829 }
830 else
831 {
832 fprintf(stderr, "kDeDup: error: failed to hard link '%ls' to '%ls': %u\n",
833 pDupFile->wszPath, wszBackup, GetLastError());
834 if (!MoveFileW(wszBackup, pDupFile->wszPath))
835 {
836 fprintf(stderr, "kDeDup: fatal: Restore '%ls' to '%ls' after hardlinking failed: %u\n",
837 wszBackup, pDupFile->wszPath, GetLastError());
838 return 8;
839 }
840 rcExit = 1;
841 }
842 }
843 else
844 {
845 fprintf(stderr, "kDeDup: error: failed to rename '%ls' to '%ls': %u\n",
846 pDupFile->wszPath, wszBackup, GetLastError());
847 rcExit = 1;
848 }
849 }
850 else
851 {
852 fprintf(stderr, "kDeDup: error: too long backup path: '%ls'\n", pDupFile->wszPath);
853 rcExit = 1;
854 }
855#else /* K_OS != K_OS_WINDOWS */
856 static const char s_szBackupSuffix[] = ".kDepBackup";
857 char szBackup[0x4000];
858 size_t cchPath = strlen(pDupFile->szPath);
859 if (cchPath + sizeof(s_szBackupSuffix) < sizeof(szBackup))
860 {
861 struct stat StTmp;
862 memcpy(szBackup, pDupFile->szPath, cchPath);
863 memcpy(&szBackup[cchPath], s_szBackupSuffix, sizeof(s_szBackupSuffix));
864 if (stat(szBackup, &StTmp) != 0)
865 {
866 if (rename(pDupFile->szPath, szBackup) == 0)
867 {
868 if (link(pTargetFile->szPath, pDupFile->szPath) == 0)
869 {
870 if (unlink(szBackup) == 0)
871 {
872 if (g_cVerbosity >= 1)
873 printf("Hardlinked '%s' to '%s'.\n", pDupFile->szPath, pTargetFile->szPath);
874 }
875 else
876 {
877 fprintf(stderr, "kDeDup: fatal: failed to delete '%s' after hardlinking: %s (%d)\n",
878 szBackup, strerror(errno), errno);
879 return 8;
880 }
881 }
882 else
883 {
884 fprintf(stderr, "kDeDup: error: failed to hard link '%s' to '%s': %s (%d)\n",
885 pDupFile->szPath, szBackup, strerror(errno), errno);
886 if (rename(szBackup, pDupFile->szPath) != 0)
887 {
888 fprintf(stderr, "kDeDup: fatal: Restore '%s' to '%s' after hardlinking failed: %s (%d)\n",
889 szBackup, pDupFile->szPath, strerror(errno), errno);
890 return 8;
891 }
892 rcExit = 1;
893 }
894 }
895 else
896 {
897 fprintf(stderr, "kDeDup: error: failed to rename '%s' to '%s': %s (%d)\n",
898 pDupFile->szPath, szBackup, strerror(errno), errno);
899 rcExit = 1;
900 }
901 }
902 else
903 {
904 fprintf(stderr, "kDeDup: error: failed to rename '%s' to '%s': file already exist (st_mode=%#x)\n",
905 pDupFile->szPath, szBackup, StTmp.st_mode);
906 rcExit = 1;
907 }
908 }
909 else
910 {
911 fprintf(stderr, "kDeDup: error: too long backup path: '%s'\n", pDupFile->szPath);
912 rcExit = 1;
913 }
914#endif /* K_OS != K_OS_WINDOWS */
915 }
916 }
917 /*
918 * Since the list is sorted by uDev, we now change the target file.
919 */
920 else
921 pTargetFile = pDupFile;
922 }
923 }
924 return rcExit;
925}
926
927
928static int usage(const char *pszName, FILE *pOut)
929{
930 fprintf(pOut,
931 "usage: %s [options] <path1> [path2 [..]]\n"
932 "usage: %s <-V|--version>\n"
933 "usage: %s <-h|--help>\n"
934 , pszName, pszName, pszName);
935 fprintf(pOut,
936 "\n"
937 "Options:\n"
938 " -H, --dereference-command-line, --no-dereference-command-line\n"
939 " Follow symbolic links on the command line.\n"
940 " -L, --dereference\n"
941 " Follow symbolic links while scanning directories.\n"
942 " -P, --no-dereference\n"
943 " Do not follow symbolic links while scanning directories.\n"
944 " -r, --recursive\n"
945 " Recurse into subdirectories, but do not follow links to them.\n"
946 " -R, --recursive-dereference\n"
947 " Same as -r, but also follow into symlinked subdirectories.\n"
948 " -x, --one-file-system\n"
949 " Do not consider other file system (volumes), either down thru a\n"
950 " mount point or via a symbolic link to a directory.\n"
951 " --no-one-file-system, --cross-file-systems\n"
952 " Reverses the effect of --one-file-system.\n"
953 " -q, --quiet, -v,--verbose\n"
954 " Controls the output level.\n"
955 " --hardlink-duplicates\n"
956 " Hardlink duplicate files to remove duplicates and save space. By default\n"
957 " no action is taken and only analysis is done.\n"
958 );
959 return 0;
960}
961
962
963#if K_OS == K_OS_WINDOWS
964int wmain(int argc, wchar_t **argv)
965#else
966int main(int argc, char **argv)
967#endif
968{
969 int rcExit;
970
971 /*
972 * Process parameters. Position.
973 */
974 unsigned cFtsArgs = 0;
975#if K_OS == K_OS_WINDOWS
976 wchar_t **papwszFtsArgs = (wchar_t **)calloc(argc + 1, sizeof(wchar_t *));
977 unsigned fFtsOptions = FTS_NOCHDIR | FTS_NO_ANSI;
978#else
979 char **papszFtsArgs = (char **)calloc(argc + 1, sizeof(char *));
980 unsigned fFtsOptions = FTS_NOCHDIR;
981#endif
982 KBOOL fEndOfOptions = K_FALSE;
983 KBOOL fHardlinkDups = K_FALSE;
984 int i;
985 for (i = 1; i < argc; i++)
986 {
987#if K_OS == K_OS_WINDOWS
988 wchar_t *pwszArg = argv[i];
989 if ( *pwszArg == '-'
990 && !fEndOfOptions)
991#else
992 char *pszArg = argv[i];
993 if ( *pszArg == '-'
994 && !fEndOfOptions)
995#endif
996 {
997#if K_OS != K_OS_WINDOWS
998 wchar_t wszOpt[1024] = { 0 };
999 wchar_t *pwszArg = wszOpt;
1000 mbsrtowcs(wszOpt, (const char **)&pszArg, 1024 - 1, NULL);
1001#endif
1002 wchar_t wcOpt = *++pwszArg;
1003 pwszArg++;
1004 if (wcOpt == '-')
1005 {
1006 /* Translate long options. */
1007 if (wcscmp(pwszArg, L"help") == 0)
1008 wcOpt = 'h';
1009 else if (wcscmp(pwszArg, L"version") == 0)
1010 wcOpt = 'V';
1011 else if (wcscmp(pwszArg, L"recursive") == 0)
1012 wcOpt = 'r';
1013 else if (wcscmp(pwszArg, L"dereference-recursive") == 0)
1014 wcOpt = 'R';
1015 else if (wcscmp(pwszArg, L"dereference") == 0)
1016 wcOpt = 'L';
1017 else if (wcscmp(pwszArg, L"dereference-command-line") == 0)
1018 wcOpt = 'H';
1019 else if (wcscmp(pwszArg, L"one-file-system") == 0)
1020 wcOpt = 'x';
1021 /* Process long options. */
1022 else if (*pwszArg == '\0')
1023 {
1024 fEndOfOptions = K_TRUE;
1025 continue;
1026 }
1027 else if (wcscmp(pwszArg, L"no-recursive") == 0)
1028 {
1029 g_fRecursive = g_fRecursiveViaSymlinks = K_FALSE;
1030 continue;
1031 }
1032 else if (wcscmp(pwszArg, L"no-dereference-command-line") == 0)
1033 {
1034 fFtsOptions &= ~FTS_COMFOLLOW;
1035 continue;
1036 }
1037 else if ( wcscmp(pwszArg, L"no-one-file-system") == 0
1038 || wcscmp(pwszArg, L"cross-file-systems") == 0)
1039 {
1040 fFtsOptions &= ~FTS_XDEV;
1041 continue;
1042 }
1043 else if (wcscmp(pwszArg, L"hardlink-duplicates") == 0)
1044 {
1045 fHardlinkDups = K_TRUE;
1046 continue;
1047 }
1048 else
1049 {
1050 fprintf(stderr, "kDeDup: syntax error: Unknown option '--%ls'\n", pwszArg);
1051 return 2;
1052 }
1053 }
1054
1055 /* Process one or more short options. */
1056 do
1057 {
1058 switch (wcOpt)
1059 {
1060 case 'r': /* --recursive */
1061 g_fRecursive = K_TRUE;
1062 break;
1063
1064 case 'R': /* --dereference-recursive */
1065 g_fRecursive = g_fRecursiveViaSymlinks = K_TRUE;
1066 break;
1067
1068 case 'H': /* --dereference-command-line */
1069 fFtsOptions |= FTS_COMFOLLOW;
1070 break;
1071
1072 case 'L': /* --dereference*/
1073 g_fFollowSymlinkedFiles = K_TRUE;
1074 break;
1075
1076 case 'x': /* --one-file-system*/
1077 fFtsOptions |= FTS_XDEV;
1078 break;
1079
1080 case 'q':
1081 g_cVerbosity = 0;
1082 break;
1083
1084 case 'v':
1085 g_cVerbosity++;
1086 break;
1087
1088
1089 case 'h':
1090 case '?':
1091 return usage("kDeDup", stdout);
1092
1093 case 'V':
1094 printf("0.0.1\n");
1095 return 0;
1096
1097 default:
1098#if K_OS == K_OS_WINDOWS
1099 fprintf(stderr, "kDeDup: syntax error: Unknown option '-%lc'\n", wcOpt);
1100#else
1101 fprintf(stderr, "kDeDup: syntax error: Unknown option '-%c'\n", (int)wcOpt);
1102#endif
1103 return 2;
1104 }
1105
1106 wcOpt = *pwszArg++;
1107 } while (wcOpt != '\0');
1108 }
1109 else
1110 {
1111 /*
1112 * Append non-option arguments to the FTS argument vector.
1113 */
1114#if K_OS == K_OS_WINDOWS
1115 papwszFtsArgs[cFtsArgs] = pwszArg;
1116#else
1117 papszFtsArgs[cFtsArgs] = pszArg;
1118#endif
1119 cFtsArgs++;
1120 }
1121 }
1122
1123 /*
1124 * Do the FTS processing.
1125 */
1126 kDupSizeTree_Init(&g_SizeRoot);
1127#if K_OS == K_OS_WINDOWS
1128 rcExit = kDupReadAll(papwszFtsArgs, fFtsOptions);
1129#else
1130 rcExit = kDupReadAll(papszFtsArgs, fFtsOptions);
1131#endif
1132 if (rcExit == 0)
1133 {
1134 /*
1135 * Display the result.
1136 */
1137 printf("Found %" KU64_PRI " duplicate files, out which %" KU64_PRI " can be hardlinked saving %" KU64_PRI " bytes\n",
1138 g_cDuplicates, g_cDuplicatesSaved, g_cbDuplicatesSaved);
1139
1140 if (fHardlinkDups)
1141 rcExit = kDupHardlinkDuplicates();
1142 }
1143
1144 K_NOREF(kDupFileTree_Remove);
1145 K_NOREF(kDupSizeTree_Remove);
1146 return rcExit;
1147}
1148
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette