#!/usr/bin/python
"""
Searches and lists (optionally removing) duplicate files in specified
directories. If directories are not specified, then the PATH environment
variable is examined.

Two entries count as duplicates when they share a base name but resolve to
different real paths. Only the top level of each directory is examined.

Requires python 2.4 or greater (sorting with a key function is used). The
code avoids print statements and dict.has_key so that the same source also
parses under python 3.

HISTORY:
v 0.1 Sometime in 2001 - Initial release
v 0.2 06/08/2002 - check if directories searched are not really the same
                   directory
v 0.3 29/09/2002 - handle files with spaces in them.
                   add force option to keep only the latest version
v 0.4 13/07/2004 - treat links as normal files too, so that masking links
                   get detected
                   print number of duplicate files they'll have to go through
                   Class-ified
2005-03-12 19:05:41 Alejandro Dubrovsky
    * survive failure to delete file
    * switch to optparse commandline parsing
    * added ignore file option
2005-04-09 18:08:38 Alejandro Dubrovsky
    * point out if duplicated directories point to each other
2005-04-20 13:06:10 Alejandro Dubrovsky
    * ignore directories specified in the ignore file too
2006-07-15 18:25:14 Alejandro Dubrovsky
    * sort list of duplicates alphabetically
"""

import sys
import os
import os.path
import time
from optparse import OptionParser

__author__ = 'Alejandro Dubrovsky'


class FileInfo(object):
    """Vital information kept for each scanned directory entry.

    name     -- path of the entry as it was found (directory + '/' + entry)
    size     -- size of the entry; main() stores lstat's st_size here
    ctime    -- timestamp used to pick the latest version; despite the name,
                main() stores lstat's st_mtime here
    realpath -- path with symbolic links resolved, '' until it is filled in
    """

    def __init__(self, name, size, ctime, realpath=''):
        self.name = name
        self.size = size
        self.ctime = ctime
        self.realpath = realpath

    def formatInfo(self):
        """Return 'name<TAB>size<TAB>human readable timestamp'."""
        return self.name + '\t' + str(self.size) + '\t' + time.ctime(self.ctime)


def _remove(filename):
    """Delete filename, reporting (not raising) a failure on stderr."""
    try:
        os.remove(filename)
    except OSError:
        sys.stderr.write('could not remove %s\n' % filename)


def confirmDelete(filename):
    """Ask on stdout/stdin whether filename should be deleted and act on it.

    The question is repeated until the answer is 'y' or 'n' (case and
    surrounding whitespace are ignored). End of input counts as 'n', so a
    closed or exhausted stdin can never delete anything nor loop forever.
    A failure to delete is reported on stderr and otherwise ignored.
    """
    answer = ''
    while answer not in ('y', 'n'):
        sys.stdout.write('Delete ' + filename + ' (y/n)?')
        # the prompt has no newline, so push it out before blocking on stdin
        sys.stdout.flush()
        line = sys.stdin.readline()
        if not line:
            # readline() returns '' only at end of file: keep the file
            sys.stdout.write('\n')
            return
        answer = line.strip().lower()
    if answer == 'y':
        _remove(filename)


def _load_portage_util():
    """Return the portage module providing grabfile(), or exit with status 1.

    The legacy top-level ``portage_util`` module is tried first, then
    ``portage.util``.
    """
    try:
        import portage_util
        return portage_util
    except ImportError:
        pass
    try:
        from portage import util as portage_util
        return portage_util
    except ImportError:
        sys.stderr.write("could not import portage_util. "
                         "probably no portage around\n")
        sys.exit(1)


def _add_unique_dirs(candidates, path, seen, verbose):
    """Append every not yet known directory in candidates to path.

    seen is a set of (st_dev, st_ino) pairs of the directories already in
    path; it lets differently spelled names of one directory be recognised.
    The device is part of the key because inode numbers are only unique
    within a single filesystem.
    """
    for p in candidates:
        if p in path:
            continue
        if os.path.isdir(p):
            status = os.stat(p)
            identity = (status.st_dev, status.st_ino)
            if identity not in seen:
                seen.add(identity)
                path.append(p)
        elif verbose:
            print(p + ' does not exist or is not a directory')


def _scan(path, ignore):
    """Examine the entries of every directory in path.

    Entries whose name is in ignore are skipped. Returns the tuple
    (badlinks, trouble, pointsto, troubledirs, dirpointsto):

    badlinks    -- FileInfo of every symbolic link whose target is missing
    trouble     -- base name -> list of FileInfo for duplicated files
    pointsto    -- real path -> list of file links resolving to it
    troubledirs -- base name -> list of FileInfo for duplicated directories
    dirpointsto -- real path -> list of directory links resolving to it
    """
    trouble = {}
    allfiles = {}
    troubledirs = {}
    alldirs = {}
    pointsto = {}
    dirpointsto = {}
    badlinks = []
    for directory in path:
        try:
            entries = os.listdir(directory)
        except OSError:
            sys.stderr.write('could not list %s\n' % directory)
            continue
        for name in entries:
            if name in ignore:
                continue
            f = directory + '/' + name
            try:
                thestats = os.lstat(f)
            except OSError:
                # the entry disappeared after it was listed
                continue
            # info is the vital info kept for each file: name, size, modtime
            info = FileInfo(f, thestats.st_size, thestats.st_mtime)
            isalink = os.path.islink(f)
            # keep track of dangling links
            if isalink and not os.path.exists(f):
                badlinks.append(info)
            if os.path.isfile(f):
                info.realpath = os.path.realpath(f)
                # if it's a link, keep track of which file it points to
                if isalink:
                    pointsto.setdefault(info.realpath, []).append(f)
                # keep a list of all tracked files, keep duplicates in trouble
                if name in trouble:
                    known = [other.realpath for other in trouble[name]]
                    # A second route to an already listed file is not a new
                    # duplicate; the existing list must be left untouched.
                    if info.realpath not in known:
                        trouble[name].append(info)
                elif name in allfiles and info.realpath != allfiles[name].realpath:
                    trouble[name] = [info, allfiles[name]]
                else:
                    allfiles[name] = info
            elif os.path.isdir(f):
                # keep track of duplicated directories too
                info.realpath = os.path.realpath(f)
                if isalink:
                    dirpointsto.setdefault(info.realpath, []).append(f)
                if name in troubledirs:
                    troubledirs[name].append(info)
                elif name in alldirs:
                    troubledirs[name] = [info, alldirs[name]]
                else:
                    alldirs[name] = info
    return badlinks, trouble, pointsto, troubledirs, dirpointsto


def _report_bad_links(badlinks, listonly):
    """List the dangling links and, unless listonly, offer to delete each."""
    if badlinks:
        print('Links to nowhere:')
    for link in badlinks:
        print(link.formatInfo() + ' -> ' + os.readlink(link.name))
        if not listonly:
            confirmDelete(link.name)


def _remove_all_but_newest(files, verbose):
    """Delete every entry of files except the one with the latest timestamp.

    On a tie the entry that comes first in files is kept. Exactly one entry
    always survives, whatever the timestamps are.
    """
    newest = files[0]
    for candidate in files[1:]:
        if candidate.ctime > newest.ctime:
            newest = candidate
    for f in files:
        if f is newest:
            continue
        if verbose:
            print('Removing ' + f.formatInfo())
        _remove(f.name)


def _report_duplicate_files(trouble, pointsto, options):
    """List each group of duplicated files and remove them as requested.

    With options.listonly nothing is removed; with options.force everything
    but the latest version of each group is removed without asking;
    otherwise every file (and every link recorded for it) is offered for
    deletion.
    """
    troubledFiles = sorted(trouble.values(), key=lambda group: group[0].name)
    if troubledFiles:
        print('Duplicated files (%d instances):' % (len(troubledFiles),))
    for files in troubledFiles:
        for f in files:
            print(f.formatInfo())
            for link in pointsto.get(f.name, []):
                print('is pointed to by ' + link)
            if os.path.islink(f.name):
                print(' points to %s ' % (os.readlink(f.name),))
        if not options.listonly:
            if options.force:
                _remove_all_but_newest(files, options.verbose)
            else:
                for f in files:
                    confirmDelete(f.name)
                    for link in pointsto.get(f.name, []):
                        confirmDelete(link)
        # blank line between groups
        print('')


def _report_duplicate_dirs(troubledirs, dirpointsto):
    """List each group of duplicated directories; nothing is ever removed."""
    if troubledirs:
        print('Duplicated directories:')
    for group in troubledirs.values():
        for d in group:
            print(d.formatInfo())
            if d.name in dirpointsto:
                print("pointed to by %s" % ",".join(dirpointsto[d.name]))
        print('')


def main():
    """Command line entry point: parse the options, scan and report.

    Exits with status 1 when no portage utility module can be imported,
    because it is needed to read the ignore file.
    """
    portage_util = _load_portage_util()
    # expanduser() still gives an answer when HOME is not set
    default_ignore_file = os.path.join(os.path.expanduser('~'),
                                       '.rmdups', 'ignore')

    parser = OptionParser('Usage: rmdups [-l|-f] [-v] [-h] [-i ignorefile] [directories]')
    parser.add_option("-v", "--verbose", dest="verbose", default=False,
                      action="store_true",
                      help="report arguments that are not directories and "
                           "files removed by --force")
    parser.add_option("-l", "--listonly", dest="listonly", default=False,
                      action="store_true",
                      help="list only (do not remove any files)")
    parser.add_option("-f", "--force", dest="force", default=False,
                      action="store_true",
                      help="keep latest version, remove all others.")
    parser.add_option("-i", "--ignorefile", dest="ignoreFile",
                      default=default_ignore_file, action="store",
                      help="file containing list of files and directories to "
                           "ignore when searching for duplicates. Defaults "
                           "to '%s'" % default_ignore_file)
    (options, args) = parser.parse_args()

    path = []
    seen = set()
    # add the given directories, skipping repeats and aliases of one another
    _add_unique_dirs(args, path, seen, options.verbose)

    if options.force and options.listonly:
        options.force = False
        print('-f and -l should not be used together. Getting rid of -f for safety reasons')

    if not path:
        # if no directories were listed, use the unique directories of the
        # PATH environment variable
        _add_unique_dirs(os.environ.get('PATH', '').split(os.pathsep),
                         path, seen, options.verbose)

    # load up list of files and directories to ignore
    ignore = set()
    if options.ignoreFile:
        ignore.update(portage_util.grabfile(options.ignoreFile))
    path = [directory for directory in path if directory not in ignore]

    (badlinks, trouble, pointsto,
     troubledirs, dirpointsto) = _scan(path, ignore)

    _report_bad_links(badlinks, options.listonly)
    _report_duplicate_files(trouble, pointsto, options)
    _report_duplicate_dirs(troubledirs, dirpointsto)


if __name__ == '__main__':
    main()