VirtualBox

source: kBuild/trunk/src/kDeDup/kDeDup.c@ 3140

Last change on this file since 3140 was 3129, checked in by bird, 7 years ago

kDeDup: Help update and changed default verbosity to --quiet.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 31.1 KB
Line 
1/* $Id: kDeDup.c 3129 2018-01-06 13:55:09Z bird $ */
2/** @file
3 * kDeDup - Utility that finds duplicate files, optionally hardlinking them.
4 */
5
6/*
7 * Copyright (c) 2016 knut st. osmundsen <bird-kBuild-spamx@anduin.net>
8 *
9 * This file is part of kBuild.
10 *
11 * kBuild is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * kBuild is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with kBuild; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 *
25 */
26
27/*******************************************************************************
28* Header Files *
29*******************************************************************************/
30#include <k/kTypes.h>
31//#include <stdlib.h>
32#include <wchar.h>
33#include <string.h>
34#include <stdio.h>
35
36#include "md5.h"
37//#include "sha2.h"
38
39#include "nt/ntstuff.h"
40#include "nt/ntstat.h"
41#include "nt/fts-nt.h"
42#include "nt/nthlp.h"
43#include "nt/ntunlink.h"
44
45
46/*********************************************************************************************************************************
47* Structures and Typedefs *
48*********************************************************************************************************************************/
49/**
50 * The key is made up of two cryptographic hashes, collisions are
51 * highly unlikely (once SHA2 is implemented).
 *
 * The file tree compares keys with memcmp() over the whole structure, so
 * every byte of both arrays takes part in the comparison.  Only the MD5 part
 * is calculated at present; the SHA-2 calls in the hashing code are still
 * commented out.
52 */
53typedef struct KDUPFILENODEKEY
54{
55 /** The MD5 digest of the file. */
56 KU8 abMd5[16];
57 /** The 256-bit SHA-2 digest of the file (not calculated yet). */
58 KU8 abSha2[32];
59} KDUPFILENODEKEY;
60/** Pointer to a file node. */
61typedef struct KDUPFILENODE *PKDUPFILENODE;
62/**
63 * Hash tree node.
 *
 * One node is allocated per file that is tracked.  The node is either linked
 * into a per-size hash tree (mKey, mpLeft, mpRight and mHeight are used by
 * the kAvl template code), or it hangs off such a tree node via the
 * pNextHardLink or pNextDup chains.
64 */
65typedef struct KDUPFILENODE
66{
67 /** The key, which is made up of two hashes. */
68 KDUPFILENODEKEY mKey;
69 /** Left branch. */
70 PKDUPFILENODE mpLeft;
71 /** Right branch. */
72 PKDUPFILENODE mpRight;
73 /** Tree height (hmm). */
74 KU8 mHeight;
75
76 /** The inode number. */
77 KU64 uInode;
78 /** The device number. */
79 KU64 uDev;
80
81 /** Pointer to next hard linked node (same inode and udev values). */
82 PKDUPFILENODE pNextHardLink;
83 /** Pointer to next duplicate node. */
84 PKDUPFILENODE pNextDup;
85 /** Pointer to next duplicate node on the global list. */
86 PKDUPFILENODE pNextGlobalDup;
87
88 /** The path to this file (variable size, the node is allocated with room
 * for the whole zero terminated string). */
89 wchar_t wszPath[1];
90} KDUPFILENODE;
91
/*
 * Instantiate the kAvl template code for the file (hash) tree.  The generated
 * functions get the kDupFileTree_ prefix and are static.  Keys are compared
 * bytewise with memcmp() over the whole KDUPFILENODEKEY structure.
 */
92/*#define KAVL_EQUAL_ALLOWED*/
93#define KAVL_CHECK_FOR_EQUAL_INSERT
94#define KAVL_MAX_STACK 32
95/*#define KAVL_RANGE */
96/*#define KAVL_OFFSET */
97/*#define KAVL_STD_KEY_COMP*/
98#define KAVLKEY KDUPFILENODEKEY
99#define KAVLNODE KDUPFILENODE
100#define KAVL_FN(name) kDupFileTree_ ## name
101#define KAVL_TYPE(prefix,name) prefix ## KDUPFILENODE ## name
102#define KAVL_INT(name) KDUPFILENODEINT ## name
103#define KAVL_DECL(rettype) static rettype
104#define KAVL_G(key1, key2) ( memcmp(&(key1), &(key2), sizeof(KDUPFILENODEKEY)) > 0 )
105#define KAVL_E(key1, key2) ( memcmp(&(key1), &(key2), sizeof(KDUPFILENODEKEY)) == 0 )
106#define KAVL_NE(key1, key2) ( memcmp(&(key1), &(key2), sizeof(KDUPFILENODEKEY)) != 0 )
107
/* The 'register' keyword is defined away while the template code is included
   and restored again afterwards. */
108#define register
109#include <k/kAvlTmpl/kAvlBase.h>
110#include <k/kAvlTmpl/kAvlDoWithAll.h>
111//#include <k/kAvlTmpl/kAvlEnum.h> - busted
112#include <k/kAvlTmpl/kAvlGet.h>
113#include <k/kAvlTmpl/kAvlGetBestFit.h>
114#include <k/kAvlTmpl/kAvlGetWithParent.h>
115#include <k/kAvlTmpl/kAvlRemove2.h>
116#include <k/kAvlTmpl/kAvlRemoveBestFit.h>
117#include <k/kAvlTmpl/kAvlUndef.h>
118#undef register
119
120
121/** Pointer to a size tree node. */
122typedef struct KDUPSIZENODE *PKDUPSIZENODE;
123/**
124 * Size tree node.
 *
 * There is one node per distinct file size in the global size tree.  The
 * files having that size are kept in the hash tree rooted at FileRoot.
125 */
126typedef struct KDUPSIZENODE
127{
128 /** The file size. */
129 KU64 mKey;
130 /** Left branch. */
131 PKDUPSIZENODE mpLeft;
132 /** Right branch. */
133 PKDUPSIZENODE mpRight;
134 /** Tree height (hmm). */
135 KU8 mHeight;
136 /** Number of files. */
137 KU32 cFiles;
138 /** Tree with same sized files.
139 * When cFiles is 1 the root node does not have hashes calculated yet. */
140 KDUPFILENODEROOT FileRoot;
141} KDUPSIZENODE;
142
/*
 * Instantiate the kAvl template code for the size tree.  The generated
 * functions get the kDupSizeTree_ prefix and are static.  The key is a plain
 * KU64, so the standard key comparison (KAVL_STD_KEY_COMP) is used.
 */
143/*#define KAVL_EQUAL_ALLOWED*/
144#define KAVL_CHECK_FOR_EQUAL_INSERT
145#define KAVL_MAX_STACK 32
146/*#define KAVL_RANGE */
147/*#define KAVL_OFFSET */
148#define KAVL_STD_KEY_COMP
149#define KAVLKEY KU64
150#define KAVLNODE KDUPSIZENODE
151#define KAVL_FN(name) kDupSizeTree_ ## name
152#define KAVL_TYPE(prefix,name) prefix ## KDUPSIZENODE ## name
153#define KAVL_INT(name) KDUPSIZENODEINT ## name
154#define KAVL_DECL(rettype) static rettype
155
156#include <k/kAvlTmpl/kAvlBase.h>
157#include <k/kAvlTmpl/kAvlDoWithAll.h>
158//#include <k/kAvlTmpl/kAvlEnum.h> - busted
159#include <k/kAvlTmpl/kAvlGet.h>
160#include <k/kAvlTmpl/kAvlGetBestFit.h>
161#include <k/kAvlTmpl/kAvlGetWithParent.h>
162#include <k/kAvlTmpl/kAvlRemove2.h>
163#include <k/kAvlTmpl/kAvlRemoveBestFit.h>
164#include <k/kAvlTmpl/kAvlUndef.h>
165
166
167/*********************************************************************************************************************************
168* Global Variables *
169*********************************************************************************************************************************/
170/** The verbosity level (0 = quiet, raised by one for each -v). */
171static unsigned g_cVerbosity = 0;
172
173/** Whether to recurse into subdirectories. */
174static KBOOL g_fRecursive = K_FALSE;
175/** Whether to recurse into symlinked subdirectories. */
176static KBOOL g_fRecursiveViaSymlinks = K_FALSE;
177/** Whether to follow symbolically linked files. */
178static KBOOL g_fFollowSymlinkedFiles = K_TRUE;
179
180/** Minimum file size to care about. */
181static KU64 g_cbMinFileSize = 1;
182/** Maximum file size to care about. */
183static KU64 g_cbMaxFileSize = KU64_MAX;
184
185/** The root of the size tree. */
186static KDUPSIZENODEROOT g_SizeRoot;
187
188/** Global list of files that have duplicates.
189 * @remarks This only contains the files in the hash tree, not the ones on
190 * the KDUPFILENODE::pNextDup list. */
191static PKDUPFILENODE g_pDuplicateHead = NULL;
192/** Where to insert the next file with duplicates. */
193static PKDUPFILENODE *g_ppNextDuplicate = &g_pDuplicateHead;
194
195/** Number of files we're tracking. */
196static KU64 g_cFiles = 0;
197/** Number of hardlinked files or files entered more than once. */
198static KU64 g_cHardlinked = 0;
199/** Number of duplicate files (not hardlinked). */
200static KU64 g_cDuplicates = 0;
201/** Number of duplicate files that can be hardlinked. */
202static KU64 g_cDuplicatesSaved = 0;
203/** Size that could be saved if the duplicates were hardlinked. */
204static KU64 g_cbDuplicatesSaved = 0;
205
206
207
208/**
209 * Wrapper around malloc() that complains when out of memory.
210 *
211 * @returns Pointer to allocated memory
212 * @param cb The size of the memory to allocate.
213 */
214static void *kDupAlloc(KSIZE cb)
215{
216 void *pvRet = malloc(cb);
217 if (pvRet)
218 return pvRet;
219 fprintf(stderr, "kDeDup: error: out of memory! (cb=%#z)\n", cb);
220 return NULL;
221}
222
223/** Wrapper around free() for symmetry. */
224#define kDupFree(ptr) free(ptr)
225
226
227static void kDupHashFile(PKDUPFILENODE pFileNode, FTSENT *pFtsEnt)
228{
229 KSIZE i;
230 PKDUPFILENODE *ppHash;
231
232 /*
233 * Open the file.
234 */
235 HANDLE hFile;
236 if (pFtsEnt && pFtsEnt->fts_parent && pFtsEnt->fts_parent->fts_dirfd != INVALID_HANDLE_VALUE)
237 hFile = birdOpenFileExW(pFtsEnt->fts_parent->fts_dirfd, pFtsEnt->fts_wcsname,
238 FILE_READ_DATA | SYNCHRONIZE,
239 FILE_ATTRIBUTE_NORMAL,
240 FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
241 FILE_OPEN,
242 FILE_NON_DIRECTORY_FILE | FILE_OPEN_FOR_BACKUP_INTENT | FILE_SYNCHRONOUS_IO_NONALERT,
243 OBJ_CASE_INSENSITIVE);
244 else
245 hFile = birdOpenFileExW(NULL, pFileNode->wszPath,
246 FILE_READ_DATA | SYNCHRONIZE,
247 FILE_ATTRIBUTE_NORMAL,
248 FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
249 FILE_OPEN,
250 FILE_NON_DIRECTORY_FILE | FILE_OPEN_FOR_BACKUP_INTENT | FILE_SYNCHRONOUS_IO_NONALERT,
251 OBJ_CASE_INSENSITIVE);
252 if (hFile != INVALID_HANDLE_VALUE)
253 {
254 /*
255 * Init the hash calculation contexts.
256 */
257 struct MD5Context Md5Ctx;
258 //SHA256CONTEXT Sha256Ctx;
259 MD5Init(&Md5Ctx);
260 //Sha256Init(&Sha256Ctx);
261
262 /*
263 * Process the file chunk by chunk.
264 *
265 * We could complicate this by memory mapping medium sized files, but
266 * those kind of complications can wait.
267 */
268 for (;;)
269 {
270 static KU8 s_abBuffer[2*1024*1024];
271 MY_NTSTATUS rcNt;
272 MY_IO_STATUS_BLOCK Ios;
273 Ios.Information = -1;
274 Ios.u.Status = -1;
275 rcNt = g_pfnNtReadFile(hFile, NULL /*hEvent*/, NULL /*pfnApc*/, NULL /*pvApcCtx*/,
276 &Ios, s_abBuffer, sizeof(s_abBuffer), NULL /*poffFile*/, NULL /*puKey*/);
277 if (MY_NT_SUCCESS(rcNt))
278 {
279 MD5Update(&Md5Ctx, s_abBuffer, (unsigned)Ios.Information);
280 //SHA256Update(&Sha256Ctx, s_abBuffer, Ios.Information);
281 }
282 else if (rcNt != STATUS_END_OF_FILE)
283 {
284 fprintf(stderr, "kDeDup: warning: Error reading '%ls': %#x\n", pFileNode->wszPath, rcNt);
285 break;
286 }
287
288 /* Check for end of file. */
289 if ( rcNt == STATUS_END_OF_FILE
290 || Ios.Information < sizeof(s_abBuffer))
291 {
292 MD5Final(pFileNode->mKey.abMd5, &Md5Ctx);
293 //Sha256Final(pFileNode->mKey.abSha2, &Sha256Ctx);
294
295 birdCloseFile(hFile);
296 return;
297 }
298 }
299
300 birdCloseFile(hFile);
301 }
302 else
303 fprintf(stderr, "kDeDup: warning: Failed to open '%ls': %s (%d)\n", pFileNode->wszPath, strerror(errno), errno);
304
305 /*
306 * Hashing failed. We fake the digests by repeating the node pointer value
307 * again and again, holding a collision with both SHA2 and MD5 with similar
308 * digest pattern for highly unlikely.
309 */
310 ppHash = (PKDUPFILENODE *)&pFileNode->mKey;
311 i = sizeof(pFileNode->mKey) / sizeof(*ppHash);
312 while (i-- > 0)
313 *ppHash++ = pFileNode;
314}
315
316
317/**
318 * Deal with one file, adding it to the tree if it matches the criteria.
319 *
320 * @returns 0 on success, non-zero on failure.
321 * @param pFtsEnt The FTS entry for the file.
322 */
323static int kDupDoFile(FTSENT *pFtsEnt)
324{
325 KU64 cbFile;
326
327 if (g_cVerbosity >= 2)
328 printf("debug: kDupDoFile(%ls)\n", pFtsEnt->fts_wcsaccpath);
329
330 /*
331 * Check that it's within the size range.
332 */
333 cbFile = pFtsEnt->fts_stat.st_size;
334 if ( cbFile >= g_cbMinFileSize
335 && cbFile <= g_cbMaxFileSize)
336 {
337 /*
338 * Start out treating this like a unique file with a unique size, i.e.
339 * allocate all the structures we might possibly need.
340 */
341 size_t cbAccessPath = (wcslen(pFtsEnt->fts_wcsaccpath) + 1) * sizeof(wchar_t);
342 PKDUPFILENODE pFileNode = (PKDUPFILENODE)kDupAlloc(sizeof(*pFileNode) + cbAccessPath);
343 PKDUPSIZENODE pSizeNode = (PKDUPSIZENODE)kDupAlloc(sizeof(*pSizeNode));
344 if (!pFileNode || !pSizeNode)
345 return 3;
346 g_cFiles++;
347
348 memset(&pFileNode->mKey, 0, sizeof(pFileNode->mKey));
349 pFileNode->pNextHardLink = NULL;
350 pFileNode->pNextDup = NULL;
351 pFileNode->pNextGlobalDup = NULL;
352 pFileNode->uDev = pFtsEnt->fts_stat.st_dev;
353 pFileNode->uInode = pFtsEnt->fts_stat.st_ino;
354 memcpy(pFileNode->wszPath, pFtsEnt->fts_wcsaccpath, cbAccessPath);
355
356 pSizeNode->mKey = cbFile;
357 pSizeNode->cFiles = 1;
358 kDupFileTree_Init(&pSizeNode->FileRoot);
359 kDupFileTree_Insert(&pSizeNode->FileRoot, pFileNode);
360
361 /*
362 * Try insert it.
363 */
364 if (kDupSizeTree_Insert(&g_SizeRoot, pSizeNode))
365 { /* unique size, nothing more to do for now. */ }
366 else
367 {
368 /*
369 * More than one file with this size. We may need to hash the
370 * hash the file we encountered with this size, if this is the
371 * second one. In that case we should check for hardlinked or
372 * double entering of the file first as well.
373 */
374 kDupFree(pSizeNode);
375 pSizeNode = kDupSizeTree_Get(&g_SizeRoot, cbFile);
376 if (pSizeNode->cFiles == 1)
377 {
378 PKDUPFILENODE pFirstFileNode = pSizeNode->FileRoot.mpRoot;
379 if ( pFirstFileNode->uInode == pFileNode->uInode
380 && pFileNode->uInode != 0
381 && pFirstFileNode->uDev == pFileNode->uDev)
382 {
383 pFileNode->pNextHardLink = pFirstFileNode->pNextHardLink;
384 pFirstFileNode->pNextHardLink = pFileNode;
385 if (g_cVerbosity >= 1)
386 printf("Found hardlinked: '%ls' -> '%ls' (ino:%#" KX64_PRI " dev:%#" KX64_PRI ")\n",
387 pFileNode->wszPath, pFirstFileNode->wszPath, pFileNode->uInode, pFileNode->uDev);
388 g_cHardlinked += 1;
389 return 0;
390 }
391
392 kDupHashFile(pFirstFileNode, NULL);
393 }
394 kDupHashFile(pFileNode, pFtsEnt);
395
396 if (kDupFileTree_Insert(&pSizeNode->FileRoot, pFileNode))
397 { /* great, unique content */ }
398 else
399 {
400 /*
401 * Duplicate content. Could be hardlinked or a duplicate entry.
402 */
403 PKDUPFILENODE pDupFileNode = kDupFileTree_Get(&pSizeNode->FileRoot, pFileNode->mKey);
404 if ( pDupFileNode->uInode == pFileNode->uInode
405 && pFileNode->uInode != 0
406 && pDupFileNode->uDev == pFileNode->uDev)
407 {
408 pFileNode->pNextHardLink = pDupFileNode->pNextHardLink;
409 pDupFileNode->pNextHardLink = pFileNode;
410 if (g_cVerbosity >= 1)
411 printf("Found hardlinked: '%ls' -> '%ls' (ino:%#" KX64_PRI " dev:%#" KX64_PRI ")\n",
412 pFileNode->wszPath, pDupFileNode->wszPath, pFileNode->uInode, pFileNode->uDev);
413 g_cHardlinked += 1;
414 }
415 else
416 {
417 KBOOL fDifferentDev;
418
419 /* Genuinly duplicate (or inode numbers are busted). */
420 if (!pDupFileNode->pNextDup)
421 {
422 *g_ppNextDuplicate = pDupFileNode;
423 g_ppNextDuplicate = &pDupFileNode->pNextGlobalDup;
424 }
425
426 /* The list is sorted by device to better facility hardlinking later. */
427 while ( (fDifferentDev = pDupFileNode->uDev != pFileNode->uDev)
428 && pDupFileNode->pNextDup)
429 pDupFileNode = pDupFileNode->pNextDup;
430
431 pFileNode->pNextDup = pDupFileNode->pNextDup;
432 pDupFileNode->pNextDup = pFileNode;
433
434 g_cDuplicates += 1;
435 if (!fDifferentDev)
436 {
437 g_cDuplicatesSaved += 1;
438 g_cbDuplicatesSaved += pFtsEnt->fts_stat.st_blocks * BIRD_STAT_BLOCK_SIZE;
439 if (g_cVerbosity >= 1)
440 printf("Found duplicate: '%ls' <-> '%ls'\n", pFileNode->wszPath, pDupFileNode->wszPath);
441 }
442 else if (g_cVerbosity >= 1)
443 printf("Found duplicate: '%ls' <-> '%ls' (devices differ).\n", pFileNode->wszPath, pDupFileNode->wszPath);
444 }
445 }
446 }
447 }
448 else if (g_cVerbosity >= 1)
449 printf("Skipping '%ls' because %" KU64_PRI " bytes is outside the size range.\n",
450 pFtsEnt->fts_wcsaccpath, cbFile);
451 return 0;
452}
453
454
455/**
456 * Process the non-option arguments, creating the file tree.
457 *
458 * @returns 0 on success, non-zero on failure.
459 * @param papwszFtsArgs The input in argv style.
460 * @param fFtsOptions The FTS options.
461 */
462static int kDupReadAll(wchar_t **papwszFtsArgs, unsigned fFtsOptions)
463{
464 int rcExit = 0;
465 FTS *pFts = nt_fts_openw(papwszFtsArgs, fFtsOptions, NULL /*pfnCompare*/);
466 if (pFts != NULL)
467 {
468 for (;;)
469 {
470 FTSENT *pFtsEnt = nt_fts_read(pFts);
471 if (pFtsEnt)
472 {
473 switch (pFtsEnt->fts_info)
474 {
475 case FTS_F:
476 rcExit = kDupDoFile(pFtsEnt);
477 if (rcExit == 0)
478 continue;
479 break;
480
481 case FTS_D:
482 if ( g_fRecursive
483 || pFtsEnt->fts_level == FTS_ROOTLEVEL) /* enumerate dirs on the command line */
484 continue;
485 rcExit = nt_fts_set(pFts, pFtsEnt, FTS_SKIP);
486 if (rcExit == 0)
487 continue;
488 fprintf(stderr, "kDeDup: internal error: nt_fts_set failed!\n");
489 rcExit = 1;
490 break;
491
492 case FTS_DP:
493 /* nothing to do here. */
494 break;
495
496 case FTS_SL:
497 /* The nice thing on windows is that we already know whether it's a
498 directory or file when encountering the symbolic link. */
499 if ( (pFtsEnt->fts_stat.st_isdirsymlink ? g_fRecursiveViaSymlinks : g_fFollowSymlinkedFiles)
500 && pFtsEnt->fts_number == 0)
501 {
502 pFtsEnt->fts_number++;
503 rcExit = nt_fts_set(pFts, pFtsEnt, FTS_FOLLOW);
504 if (rcExit == 0)
505 continue;
506 fprintf(stderr, "kDeDup: internal error: nt_fts_set failed!\n");
507 rcExit = 1;
508 }
509 break;
510
511 case FTS_DC:
512 fprintf(stderr, "kDeDup: warning: Ignoring cycle '%ls'!\n", pFtsEnt->fts_wcsaccpath);
513 continue;
514
515 case FTS_NS:
516 fprintf(stderr, "kDeDup: warning: Failed to stat '%ls': %s (%d)\n",
517 pFtsEnt->fts_wcsaccpath, strerror(pFtsEnt->fts_errno), pFtsEnt->fts_errno);
518 continue;
519
520 case FTS_DNR:
521 fprintf(stderr, "kDeDup: error: Error reading directory '%ls': %s (%d)\n",
522 pFtsEnt->fts_wcsaccpath, strerror(pFtsEnt->fts_errno), pFtsEnt->fts_errno);
523 rcExit = 1;
524 break;
525
526 case FTS_ERR:
527 fprintf(stderr, "kDeDup: error: Error on '%ls': %s (%d)\n",
528 pFtsEnt->fts_wcsaccpath, strerror(pFtsEnt->fts_errno), pFtsEnt->fts_errno);
529 rcExit = 1;
530 break;
531
532
533 /* ignore */
534 case FTS_SLNONE:
535 case FTS_DEFAULT:
536 break;
537
538 /* Not supposed to get here. */
539 default:
540 fprintf(stderr, "kDeDup: internal error: fts_info=%d - '%ls'\n",
541 pFtsEnt->fts_info, pFtsEnt->fts_wcsaccpath);
542 rcExit = 1;
543 break;
544 }
545 }
546 else if (errno == 0)
547 break;
548 else
549 {
550 fprintf(stderr, "kDeDup: error: nt_fts_read failed: %s (%d)\n", strerror(errno), errno);
551 rcExit = 1;
552 break;
553 }
554 }
555
556 if (nt_fts_close(pFts) != 0)
557 {
558 fprintf(stderr, "kDeDup: error: nt_fts_close failed: %s (%d)\n", strerror(errno), errno);
559 rcExit = 1;
560 }
561 }
562 else
563 {
564 fprintf(stderr, "kDeDup: error: nt_fts_openw failed: %s (%d)\n", strerror(errno), errno);
565 rcExit = 1;
566 }
567
568 return rcExit;
569}
570
571
572/**
573 * Hardlink duplicates.
574 */
575static int kDupHardlinkDuplicates(void)
576{
577 int rcExit = 0;
578 PKDUPFILENODE pFileNode;
579 for (pFileNode = g_pDuplicateHead; pFileNode != NULL; pFileNode = pFileNode->pNextGlobalDup)
580 {
581 PKDUPFILENODE pTargetFile = pFileNode;
582 PKDUPFILENODE pDupFile;
583 for (pDupFile = pFileNode->pNextDup; pDupFile != NULL; pDupFile = pDupFile->pNextDup)
584 {
585 /*
586 * Can only hard link if the files are on the same device.
587 */
588 if (pDupFile->uDev == pTargetFile->uDev)
589 {
590 /** @todo compare the files? */
591 if (1)
592 {
593 /*
594 * Start by renaming the orinal file before we try create the hard link.
595 */
596 static const wchar_t s_wszBackupSuffix[] = L".kDepBackup";
597 wchar_t wszBackup[0x4000];
598 size_t cwcPath = wcslen(pDupFile->wszPath);
599 if (cwcPath + sizeof(s_wszBackupSuffix) / sizeof(wchar_t) < K_ELEMENTS(wszBackup))
600 {
601 memcpy(wszBackup, pDupFile->wszPath, cwcPath * sizeof(wchar_t));
602 memcpy(&wszBackup[cwcPath], s_wszBackupSuffix, sizeof(s_wszBackupSuffix));
603 if (MoveFileW(pDupFile->wszPath, wszBackup))
604 {
605 if (CreateHardLinkW(pDupFile->wszPath, pTargetFile->wszPath, NULL))
606 {
607 if (birdUnlinkForcedW(wszBackup) == 0)
608 {
609 if (g_cVerbosity >= 1)
610 printf("Hardlinked '%ls' to '%ls'.\n", pDupFile->wszPath, pTargetFile->wszPath);
611 }
612 else
613 {
614 fprintf(stderr, "kDeDup: fatal: failed to delete '%ls' after hardlinking: %s (%d)\n",
615 wszBackup, strerror(errno), errno);
616 return 8;
617 }
618 }
619 else
620 {
621 fprintf(stderr, "kDeDup: error: failed to hard link '%ls' to '%ls': %u\n",
622 pDupFile->wszPath, wszBackup, GetLastError());
623 if (!MoveFileW(wszBackup, pDupFile->wszPath))
624 {
625 fprintf(stderr, "kDeDup: fatal: Restore back '%ls' to '%ls' after hardlinking faild: %u\n",
626 wszBackup, pDupFile->wszPath, GetLastError());
627 return 8;
628 }
629 rcExit = 1;
630 }
631 }
632 else
633 {
634 fprintf(stderr, "kDeDup: error: failed to rename '%ls' to '%ls': %u\n",
635 pDupFile->wszPath, wszBackup, GetLastError());
636 rcExit = 1;
637 }
638 }
639 else
640 {
641 fprintf(stderr, "kDeDup: error: too long backup path: '%ls'\n", pDupFile->wszPath);
642 rcExit = 1;
643 }
644 }
645 }
646 /*
647 * Since the list is sorted by uDev, we now change the target file.
648 */
649 else
650 pTargetFile = pDupFile;
651 }
652 }
653 return rcExit;
654}
655
656
/**
 * Prints the usage / help text.
 *
 * @returns 0 (suitable as program exit code).
 * @param   pszName     The program name to show in the syntax lines.
 * @param   pOut        The stream to write the text to.
 */
static int usage(const char *pszName, FILE *pOut)
{
    fprintf(pOut,
            "usage: %s [options] <path1> [path2 [..]]\n"
            "usage: %s <-V|--version>\n"
            "usage: %s <-h|--help>\n"
            , pszName, pszName, pszName);
    /* Note: The long option names given here must match the ones the
       argument parser translates, e.g. -R is --dereference-recursive. */
    fprintf(pOut,
            "\n"
            "Options:\n"
            " -H, --dereference-command-line\n"
            " Follow symbolic links on the command line.\n"
            " --no-dereference-command-line\n"
            " Do not follow symbolic links on the command line.\n"
            " -L, --dereference\n"
            " Follow symbolic links while scanning directories.\n"
            " -P, --no-dereference\n"
            " Do not follow symbolic links while scanning directories.\n"
            " -r, --recursive\n"
            " Recurse into subdirectories, but do not follow links to them.\n"
            " -R, --dereference-recursive\n"
            " Same as -r, but also follow into symlinked subdirectories.\n"
            " -x, --one-file-system\n"
            " Do not consider other file system (volumes), either down thru a\n"
            " mount point or via a symbolic link to a directory.\n"
            " --no-one-file-system, --cross-file-systems\n"
            " Reverses the effect of --one-file-system.\n"
            " -q, --quiet, -v,--verbose\n"
            " Controls the output level.\n"
            " --hardlink-duplicates\n"
            " Hardlink duplicate files to remove duplicates and save space. By default\n"
            " no action is taken and only analysis is done.\n"
            );
    return 0;
}
690
691
692int wmain(int argc, wchar_t **argv)
693{
694 int rcExit;
695
696 /*
697 * Process parameters. Position.
698 */
699 wchar_t **papwszFtsArgs = (wchar_t **)calloc(argc + 1, sizeof(wchar_t *));
700 unsigned cFtsArgs = 0;
701 unsigned fFtsOptions = FTS_NOCHDIR | FTS_NO_ANSI;
702 KBOOL fEndOfOptions = K_FALSE;
703 KBOOL fHardlinkDups = K_FALSE;
704 int i;
705 for (i = 1; i < argc; i++)
706 {
707 wchar_t *pwszArg = argv[i];
708 if ( *pwszArg == '-'
709 && !fEndOfOptions)
710 {
711 wchar_t wcOpt = *++pwszArg;
712 pwszArg++;
713 if (wcOpt == '-')
714 {
715 /* Translate long options. */
716 if (wcscmp(pwszArg, L"help") == 0)
717 wcOpt = 'h';
718 else if (wcscmp(pwszArg, L"version") == 0)
719 wcOpt = 'V';
720 else if (wcscmp(pwszArg, L"recursive") == 0)
721 wcOpt = 'r';
722 else if (wcscmp(pwszArg, L"dereference-recursive") == 0)
723 wcOpt = 'R';
724 else if (wcscmp(pwszArg, L"dereference") == 0)
725 wcOpt = 'L';
726 else if (wcscmp(pwszArg, L"dereference-command-line") == 0)
727 wcOpt = 'H';
728 else if (wcscmp(pwszArg, L"one-file-system") == 0)
729 wcOpt = 'x';
730 /* Process long options. */
731 else if (*pwszArg == '\0')
732 {
733 fEndOfOptions = K_TRUE;
734 continue;
735 }
736 else if (wcscmp(pwszArg, L"no-recursive") == 0)
737 {
738 g_fRecursive = g_fRecursiveViaSymlinks = K_FALSE;
739 continue;
740 }
741 else if (wcscmp(pwszArg, L"no-dereference-command-line") == 0)
742 {
743 fFtsOptions &= ~FTS_COMFOLLOW;
744 continue;
745 }
746 else if ( wcscmp(pwszArg, L"no-one-file-system") == 0
747 || wcscmp(pwszArg, L"cross-file-systems") == 0)
748 {
749 fFtsOptions &= ~FTS_XDEV;
750 continue;
751 }
752 else if (wcscmp(pwszArg, L"hardlink-duplicates") == 0)
753 {
754 fHardlinkDups = K_TRUE;
755 continue;
756 }
757 else
758 {
759 fprintf(stderr, "kDeDup: syntax error: Unknown option '--%ls'\n", pwszArg);
760 return 2;
761 }
762 }
763
764 /* Process one or more short options. */
765 do
766 {
767 switch (wcOpt)
768 {
769 case 'r': /* --recursive */
770 g_fRecursive = K_TRUE;
771 break;
772
773 case 'R': /* --dereference-recursive */
774 g_fRecursive = g_fRecursiveViaSymlinks = K_TRUE;
775 break;
776
777 case 'H': /* --dereference-command-line */
778 fFtsOptions |= FTS_COMFOLLOW;
779 break;
780
781 case 'L': /* --dereference*/
782 g_fFollowSymlinkedFiles = K_TRUE;
783 break;
784
785 case 'x': /* --one-file-system*/
786 fFtsOptions |= FTS_XDEV;
787 break;
788
789 case 'q':
790 g_cVerbosity = 0;
791 break;
792
793 case 'v':
794 g_cVerbosity++;
795 break;
796
797
798 case 'h':
799 case '?':
800 return usage("kDeDup", stdout);
801
802 case 'V':
803 printf("0.0.1\n");
804 return 0;
805
806 default:
807 fprintf(stderr, "kDeDup: syntax error: Unknown option '-%lc'\n", wcOpt);
808 return 2;
809 }
810
811 wcOpt = *pwszArg++;
812 } while (wcOpt != '\0');
813 }
814 else
815 {
816 /*
817 * Append non-option arguments to the FTS argument vector.
818 */
819 papwszFtsArgs[cFtsArgs] = pwszArg;
820 cFtsArgs++;
821 }
822 }
823
824 /*
825 * Do the FTS processing.
826 */
827 kDupSizeTree_Init(&g_SizeRoot);
828 rcExit = kDupReadAll(papwszFtsArgs, fFtsOptions);
829 if (rcExit == 0)
830 {
831 /*
832 * Display the result.
833 */
834 printf("Found %" KU64_PRI " duplicate files, out which %" KU64_PRI " can be hardlinked saving %" KU64_PRI " bytes\n",
835 g_cDuplicates, g_cDuplicatesSaved, g_cbDuplicatesSaved);
836
837 if (fHardlinkDups)
838 rcExit = kDupHardlinkDuplicates();
839 }
840
841 return rcExit;
842}
843
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette