Automated cleanup by prefer and skip-manual
peteraba committed Sep 14, 2017
1 parent 4a39ed0 commit f6c2aaf
Showing 2 changed files with 129 additions and 92 deletions.
29 changes: 16 additions & 13 deletions README.md
@@ -6,23 +6,26 @@ When duplicates are found, it can provide an option to delete one of them.

How it works:
1. It scans the directory structure under `root` and groups them by filesize.
2. It loops through the created group sizes and hashes all files in a group with the same size.
3. When it finds files with the same non-zero filesize and the same hash, it identifies them as being the same.
4. When run with the `fix` option, it will offer to keep only one of the same files, otherwise it just lists duplicates
2. It loops through each group and decides whether its files are likely the same by hashing the first 1KB of each file, collecting groups of files that share both the same size and the same first 1KB of data (a minimal sketch of this idea follows the list).
3. At this point it can do different things, depending on the options:
1. It can simply list the files that appear to be the same
2. It can offer to delete files, group by group
3. It can check whether exactly one file in a group matches a regular expression (`prefer`) and automatically keep only that file
4. If `skip-manual` is provided, groups in which no preferred file is found are skipped.
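The tool's actual implementation follows in main.go below; as a quick illustration of steps 1–2, here is a minimal, self-contained Go sketch (simplified, hypothetical code, not part of dblfinder) that groups files by size and then by the md5 of their first 1KB:

```go
// sketch.go — a simplified illustration of the size-then-partial-hash grouping
// described above; not dblfinder's own code.
package main

import (
	"crypto/md5"
	"fmt"
	"io"
	"os"
	"path/filepath"
)

// firstKBHash returns the md5 sum of at most the first 1KB of the file at path.
func firstKBHash(path string) (string, error) {
	f, err := os.Open(path)
	if err != nil {
		return "", err
	}
	defer f.Close()

	h := md5.New()
	if _, err := io.CopyN(h, f, 1024); err != nil && err != io.EOF {
		return "", err
	}

	return fmt.Sprintf("%x", h.Sum(nil)), nil
}

func main() {
	if len(os.Args) < 2 {
		fmt.Println("usage: sketch <root>")
		return
	}
	root := os.Args[1]

	// Step 1: group regular, non-empty files by size.
	bySize := map[int64][]string{}
	err := filepath.Walk(root, func(path string, info os.FileInfo, err error) error {
		if err == nil && info.Mode().IsRegular() && info.Size() > 0 {
			bySize[info.Size()] = append(bySize[info.Size()], path)
		}
		return nil
	})
	if err != nil {
		fmt.Printf("walk failed: %v\n", err)
		return
	}

	// Step 2: within each size group, group by the hash of the first 1KB.
	for size, paths := range bySize {
		if len(paths) < 2 {
			continue
		}

		byHash := map[string][]string{}
		for _, p := range paths {
			if sum, err := firstKBHash(p); err == nil {
				byHash[sum] = append(byHash[sum], p)
			}
		}

		for _, group := range byHash {
			if len(group) > 1 {
				fmt.Printf("probable duplicates (%d bytes): %v\n", size, group)
			}
		}
	}
}
```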


```
Usage:
# Display help
dblfinder -h | --help
# Display version number
dblfinder -v | --version
# Find duplicates recursively in the directory provided
dblfinder --help
dblfinder --version
dblfinder [--fix] [--limit=<n>] [--verbose] <root>
Options:
-h --help display help
-v --version display version number
--fix try to fix issues, not only list them
--limit=<n> limit the maximum number of duplicates to fix [default: 0]
--verbose provide verbose output
--help display help
--version display version number
--verbose provide verbose output
--fix try to fix issues, not only list them
--prefer=<s> prefer path if it matches regexp defined here
--skip-manual skip decisions if prefer did not find anything
--limit=<n> limit the maximum number of duplicates to fix [default: 0]
```
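For example, assuming a hypothetical photo collection in which the copies to keep live under an `originals` directory, a fully automatic cleanup could be run as:

```
dblfinder --fix --prefer="/originals/" --skip-manual --verbose /home/user/photos
```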
192 changes: 113 additions & 79 deletions main.go
@@ -6,38 +6,72 @@ package main

import (
"crypto/md5"
"flag"
"fmt"
"io/ioutil"
"log"
"os"
"path/filepath"
"strconv"

docopt "github.com/docopt/docopt-go"
"regexp"
)

const name = "dblfinder"
const version = "0.2.0"
const version = "0.3.0"
const usage = `
Dblfinder provides a command-line tool for finding duplicated files.
When duplicates are found, it can provide an option to delete one of them.
Usage:
dblfinder -h | --help
dblfinder -v | --version
dblfinder --help
dblfinder --version
dblfinder [--fix] [--limit=<n>] [--verbose] <root>
Options:
-h --help display help
-v --version display version number
--fix try to fix issues, not only list them
--limit=<n> limit the maximum number of duplicates to fix [default: 0]
--verbose provide verbose output
--help display help
--version display version number
--verbose provide verbose output
--fix try to fix issues, not only list them
--prefer=<s> prefer path if it matches regexp defined here
--skip-manual skip decisions if prefer did not find anything
--limit=<n> limit the maximum number of duplicates to fix [default: 0]
`

func getFlags() (bool, int, bool, string, string, bool) {
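// Parse the command-line flags and return the fix, limit, verbose, root, prefer and skipManual settings.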
var (
showHelp, showVersion, verbose, fix, skipManual bool
limit int
prefer, root string
)

flag.BoolVar(&showHelp, "help", false, "display help")
flag.BoolVar(&showVersion, "version", false, "display the version number")
flag.BoolVar(&verbose, "verbose", false, "provide verbose output")
flag.IntVar(&limit, "limit", 0, "limit the maximum number of duplicates to fix")
flag.BoolVar(&fix, "fix", false, "try to fix issues, not only list them")
flag.StringVar(&prefer, "prefer", "", "prefer path if it matches regexp defined here")
flag.BoolVar(&skipManual, "skip-manual", false, "skip decisions if prefer did not find anything")

flag.Parse()

root = flag.Arg(0)

if showHelp {
flag.PrintDefaults()
os.Exit(0)
}

if showVersion {
fmt.Println(version)
os.Exit(0)
}

return fix, limit, verbose, root, prefer, skipManual
}

func main() {
fix, limit, verbose, root, err := getFlags()
fix, limit, verbose, root, prefer, skipManual := getFlags()

if root == "" {
fmt.Printf("No root is provided", err)
fmt.Printf("No root is provided")
return
}

@@ -46,14 +80,14 @@ func main() {
fmt.Printf("filepath.Walk() returned an error: %v\n", err)
return
} else {
fmt.Printf("Visited at least %d files\n", len(filesizes))
fmt.Printf("Found %d unique filenames\n", len(filesizes))
}

sameSizeFiles, count := filterSameSizeFiles(filesizes)
if count > 0 {
fmt.Printf("%d files were be hashed\n", count)
fmt.Printf("%d files need to be hashed:\n", count)
} else {
fmt.Printf("No files were be hashed\n")
fmt.Printf("No files need to be hashed\n")
return
}

@@ -66,50 +100,12 @@ func main() {
}

if fix {
cleanUp(sameHashFiles, limit)
cleanUp(sameHashFiles, prefer, skipManual)
} else {
listAll(sameHashFiles, limit)
}
}

func getFlags() (bool, int, bool, string, error) {
var (
fix bool
limit int
verbose bool
root string
rawLimit string
limit64 int64
)

arguments, err := docopt.Parse(usage, nil, true, fmt.Sprintf("%s %s", name, version), false)
if err != nil {
return fix, limit, verbose, root, err
}

if arguments["fix"] != nil {
fix = true
}
if arguments["verbose"] != nil {
verbose = true
}
if arguments["limit"] != nil {
rawLimit = arguments["limit"].(string)
}
root = arguments["<root>"].(string)

if rawLimit != "" {
limit64, err = strconv.ParseInt(rawLimit, 10, 64)

limit = int(limit64)
if limit < 0 {
limit = 0
}
}

return fix, limit, verbose, root, nil
}

// getAllFilesizes scans the root directory recursively and returns the path of each file found
func getAllFilesizes(root string) (map[int64][]string, error) {
filesizes := make(map[int64][]string)
@@ -148,7 +144,7 @@ func filterSameSizeFiles(filesizes map[int64][]string) (map[int64][]string, int)
return sameSizeFiles, count
}

// filterSameHashFiles removes strings from a sameSizeFiles map all files that have a unique md5 hash
// filterSameHashFiles removes from the sameSizeFiles map all files that have a unique md5 hash
func filterSameHashFiles(sameSizeFiles map[int64][]string, limit int, verbose bool) ([][]string, int) {
sameHashFiles := [][]string{}
count := 0
@@ -160,6 +156,10 @@ func filterSameHashFiles(sameSizeFiles map[int64][]string, limit int, verbose bo
break
}

if verbose {
fmt.Printf("Hashing files: %v\n", files)
}

uniqueHashes := getUniqueHashes(files, verbose)

for _, paths := range uniqueHashes {
@@ -196,13 +196,20 @@ func hashWorker(path string, md5s chan *md5ToHash, verbose bool) {
fmt.Printf("About to read \"%s\"\n", path)
}

data, err := ioutil.ReadFile(path)
f, err := os.Open(path)
if err != nil {
if verbose {
fmt.Printf("Reading data for \"%s\" failed.\n", path)
}
md5s <- &md5ToHash{path, "", err}
return
log.Fatal(err)
}

data := make([]byte, 1024)
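// 1KB buffer: only the first kilobyte of each file is read and hashed for the quick duplicate check.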

_, err = f.Read(data)
if err != nil {
log.Fatal(err)
}

if err := f.Close(); err != nil {
log.Fatal(err)
}

h := md5.New()
@@ -213,6 +220,9 @@

if verbose {
fmt.Printf("Calculated md5 of \"%s\".\n", path)
} else {
fmt.Print(".")
}

md5s <- &md5ToHash{path, string(sum), nil}
@@ -254,28 +264,52 @@ func getHashResults(md5s chan *md5ToHash, max int) map[string][]string {
// cleanUp deletes all but one instance of each group of identical files
// the number of the file to keep is read from standard input (counting starts at 1)
// a value of zero skips file deletion
// os part is done in deleteAllFilesButI
func cleanUp(sameSizeFiles [][]string, limit int) {
for key, files := range sameSizeFiles {
if limit > 0 && key >= limit {
fmt.Println("Cleanup limit is reached.")
break
}
// os part is done in deleteOtherFiles
func cleanUp(sameSizeFiles [][]string, prefer string, skipManual bool) {
var (
preferRegexp *regexp.Regexp
keep int
)

if prefer != "" {
preferRegexp = regexp.MustCompile(prefer)
}

for _, files := range sameSizeFiles {
fmt.Println("The following files are the same:")

keep = 0
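// keep is 1-based: 0 means no preferred file has matched yet, -1 means more than one matched.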
for key, file := range files {
fmt.Printf("[%d] %s\n", key+1, file)

if preferRegexp == nil || keep < 0 || !preferRegexp.MatchString(file) {
continue
}

// We found more than one preferred file here...
if keep > 0 {
keep = -1
continue
}

keep = key + 1
}

if keep < 1 && skipManual {
fmt.Printf("Preferred file not found, deletion skipped.\n\n")
continue
}

i := readInt(len(files))
for keep < 1 || keep > len(files) {
keep = readInt(len(files))
}

if i == 0 {
if keep == 0 {
fmt.Printf("Deletion skipped.\n\n")
} else {
fmt.Printf("Deleting all, but `%s`.\n", files[i-1])
} else if keep > 0 {
fmt.Printf("Deleting all, but `%s`.\n", files[keep-1])

deleteAllFilesButI(files, i)
deleteOtherFiles(files, keep)

fmt.Printf("\n\n")
}
@@ -303,9 +337,9 @@ func readInt(max int) int {
return i
}

// deleteAllFilesButI deletes a list of files, except for the i.-th file, counting from 1
func deleteAllFilesButI(files []string, i int) {
delFiles := append(files[:i-1], files[i:]...)
// deleteOtherFiles deletes all files in the list except the keep-th one, counting from 1
func deleteOtherFiles(files []string, keep int) {
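// Note: append reuses the backing array of files here, which is safe because files is not used again after this call.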
delFiles := append(files[:keep-1], files[keep:]...)

for _, file := range delFiles {
fmt.Printf("Removing: %s\n", file)
@@ -322,7 +356,7 @@ func deleteAllFilesButI(files []string, i int) {
// cleanUp deletes all but one instance of each group of identical files
// the number of the file to keep is read from standard input (counting starts at 1)
// a value of zero skips file deletion
// os part is done in deleteAllFilesButI
// os part is done in deleteOtherFiles
func listAll(sameSizeFiles [][]string, limit int) {
for key, files := range sameSizeFiles {
if limit > 0 && key >= limit {
