#!/usr/bin/env python
"""
Searches and lists (optionally removing) duplicate files in specified directories.
If directories are not specified, then the PATH environment variable is examined.
"""

"""
HISTORY:


v 0.1  Sometime in 2001	- Initial release
v 0.2  06/08/2002 - check if directories searched are not really the same directory
v 0.3  29/09/2002 - handle files with spaces in them.  add force option to keep only
the latest version
v 0.4  13/07/2004 - treat links as normal files too, so that masking links get detected
					print number of duplicate files they'll have to go through
					Class-ified
2005-03-12 19:05:41 Alejandro Dubrovsky <alito@organicrobot.com>
	* survive failure to delete file
	* switch to optparse commandline parsing
	* added ignore file option

2005-04-09 18:08:38 Alejandro Dubrovsky <alito@organicrobot.com>
	* point out if duplicated directories point to each other

2005-04-20 13:06:10 Alejandro Dubrovsky <alito@organicrobot.com>
	* ignore directories specified in the ignore file too

2006-07-15 18:25:14 Alejandro Dubrovsky <alito@organicrobot.com>
	* sort list of duplicates alphabetically

2011-05-08 Alejandro Dubrovsky <alito@organicrobot.com>
	* Remove portage dependency
	* Modernise code a little bit

"""

import sys,os
import time
import logging
from collections import defaultdict

# Author credited for this script.
__author__ = 'Alejandro Dubrovsky'

# Ignore list used when no -i/--ignorefile option is given on the command
# line; "~" is expanded to the current user's home directory at import time.
DefaultIgnoreFile = os.path.expanduser("~/.rmdups/ignore")

class FileInfo(object):
	"""
	Plain record of the details kept for one directory entry.

	name     -- path of the entry
	size     -- size value stored for the entry
	ctime    -- timestamp in seconds since the epoch, rendered by formatInfo
	realpath -- resolved path of the entry; empty string until filled in
	"""

	def __init__(self, name, size, ctime, realpath=''):
		self.name, self.size = name, size
		self.ctime, self.realpath = ctime, realpath

	def formatInfo(self):
		"""Return name, size and human readable time as tab separated text."""
		columns = (self.name, str(self.size), time.ctime(self.ctime))
		return '\t'.join(columns)
	

def confirm(message='', write=sys.stdout.write, read=sys.stdin.readline):
	"""
	Ask a yes/no question, repeating it until a usable answer is given.

	message -- prompt text, written before every attempt
	write   -- callable used to emit the prompt (default: sys.stdout.write)
	read    -- callable returning one line of input (default: sys.stdin.readline)

	Only the first non-blank character of the reply counts, ignoring case.
	Returns True for 'y' and False for 'n'.  End of input (read() returning
	an empty value) is treated as 'n' instead of re-prompting forever.
	"""
	while True:
		write(message)
		line = read()
		if not line:
			# readline() yields '' only at end of input; a blank reply still
			# carries its newline, so this cannot swallow a real answer.
			return False
		answer = line.strip()[:1].lower()
		if answer in ('y', 'n'):
			return answer == 'y'

	
def confirmDelete(filename):
	"""
	Ask the user whether filename should be deleted and remove it if so.

	Returns True only when the user agreed and the file was removed.  A
	failed removal is logged as an error and reported as False rather than
	raised, so callers can carry on with the remaining files.
	"""
	if not confirm('Delete %s (y/n)?' % filename):
		return False
	try:
		os.remove(filename)
	except (IOError, OSError) as e:
		logging.error('could not remove %s: %s', filename, e)
		return False
	return True


def constructPathList(paths):
	"""
	Return the directories named in paths, in order, without repeats.

	Two entries count as the same directory when os.stat() reports the same
	(st_dev, st_ino) pair for them, so a repeated name and a different name
	for one directory (for example a symlink to it) are both kept only once.
	Entries that do not exist or are not directories are skipped with a
	warning.
	"""
	pathList = []
	seen = set()
	for p in paths:
		if not os.path.isdir(p):
			logging.warning('%s does not exist or is not a directory', p)
			continue
		st = os.stat(p)
		# An inode number is only unique within one filesystem, so the device
		# has to be part of the key or unrelated directories on different
		# mounts could be mistaken for each other.
		identity = (st.st_dev, st.st_ino)
		if identity not in seen:
			seen.add(identity)
			pathList.append(p)

	return pathList


def readFile(filename):
	"""
	Return the list of filenames/directories listed in filename.

	Each entry is returned with surrounding whitespace removed.  Blank lines
	and lines whose first non-blank character is '#' are skipped.  If the
	file cannot be opened the error is logged and an empty list is returned.
	"""
	try:
		fin = open(filename)
	except (OSError, IOError) as e:
		logging.error("Could not open %s for reading: %s", filename, e)
		return []

	# The with block closes the file even if reading fails part way through.
	with fin:
		stripped = (line.strip() for line in fin)
		return [entry for entry in stripped
				if entry and not entry.startswith("#")]

	
def _buildParser():
	"""Create the optparse parser describing the rmdups command line."""
	from optparse import OptionParser

	parser = OptionParser('Usage:  rmdups [-l|-f] [-v] [-h] [-i ignorefile] [directories]', epilog=__doc__)
	parser.add_option("-v", "--verbose", dest="verbose", default=False, action="store_true",
					help="Be verbose")
	parser.add_option("-l", "--listonly", dest="listonly", default=False, action="store_true",
					help="list only (do not remove any files)")
	parser.add_option("-f", "--force", dest="force", default=False, action="store_true",
					help="keep latest version, remove all others.")
	parser.add_option("-i", "--ignorefile", dest="ignoreFile", default=DefaultIgnoreFile, action="store",
					help="file containing list of files and directories to ignore when searching for duplicates.  (default: %default)")
	return parser


def _noteFile(info, name, allfiles, trouble):
	"""Record a file under name, moving it to trouble once two different targets share it."""
	if name in trouble:
		# Already a known duplicate: only add targets not listed yet.  Another
		# alias of a listed target must leave the existing list untouched.
		if info.realpath not in [other.realpath for other in trouble[name]]:
			trouble[name].append(info)
	elif name in allfiles and info.realpath != allfiles[name].realpath:
		trouble[name] = [info, allfiles[name]]
	else:
		allfiles[name] = info


def _noteDir(info, name, alldirs, troubledirs):
	"""Record a directory under name, moving it to troubledirs on the second sighting."""
	if name in troubledirs:
		troubledirs[name].append(info)
	elif name in alldirs:
		troubledirs[name] = [info, alldirs[name]]
	else:
		alldirs[name] = info


def _scanDirectories(path, ignore):
	"""
	Look at every entry of every directory in path (not recursively).

	Entries whose name is in ignore are skipped.  Directories that cannot be
	listed and entries that cannot be stat'ed are logged and skipped.

	Returns a tuple (badlinks, trouble, troubledirs, pointsto, dirpointsto):
	badlinks    -- FileInfo of symlinks whose target does not exist
	trouble     -- name -> list of FileInfo for files found more than once
	troubledirs -- name -> list of FileInfo for directories found more than once
	pointsto    -- resolved path -> symlinks to files resolving to it
	dirpointsto -- resolved path -> symlinks to directories resolving to it
	"""
	badlinks = []
	allfiles, trouble = {}, {}
	alldirs, troubledirs = {}, {}
	pointsto = defaultdict(list)
	dirpointsto = defaultdict(list)

	for directory in path:
		try:
			entries = os.listdir(directory)
		except OSError as e:
			logging.error('Could not list %s: %s', directory, e)
			continue

		for each in entries:
			if each in ignore:
				continue
			f = os.path.join(directory, each)
			try:
				# lstat so that a link is described itself, not its target
				thestats = os.lstat(f)
			except OSError as e:
				logging.error('Could not stat %s: %s', f, e)
				continue
			#info is the vital info kept for each file, name size and modtime
			info = FileInfo(f, thestats.st_size, thestats.st_mtime)

			#keep track of dangling links
			isalink = os.path.islink(f)
			if isalink and not os.path.exists(f):
				badlinks.append(info)

			if os.path.isfile(f):
				info.realpath = os.path.realpath(f)
				#if it's a link, keep track of which file it points to
				if isalink:
					pointsto[info.realpath].append(f)
				_noteFile(info, each, allfiles, trouble)
			elif os.path.isdir(f):
				#keep track of duplicated directories too
				info.realpath = os.path.realpath(f)
				if isalink:
					dirpointsto[info.realpath].append(f)
				_noteDir(info, each, alldirs, troubledirs)

	return badlinks, trouble, troubledirs, pointsto, dirpointsto


def _reportBadLinks(badlinks, listonly):
	"""Print the dangling links and, unless listonly, offer to delete each one."""
	if not badlinks:
		return
	print('Links to nowhere:')
	for link in badlinks:
		print(link.formatInfo() + ' -> ' + os.readlink(link.name))
		if not listonly:
			confirmDelete(link.name)


def _removeAllButNewest(files):
	"""Delete every file in files except the one with the latest timestamp."""
	# max() returns the first of several equally recent entries, which is the
	# one the former strict '>' scan kept as well.
	newest = max(files, key=lambda info: info.ctime)
	for f in files:
		if f is newest:
			continue
		logging.info('Removing %s', f.formatInfo())
		try:
			os.remove(f.name)
		except (IOError, OSError) as e:
			logging.error('Could not remove %s: %s', f.name, e)


def _reportDuplicateFiles(trouble, pointsto, options):
	"""Print each group of duplicated files and remove files as the options ask."""
	troubledFiles = sorted(trouble.values(), key=lambda group: group[0].name)
	if not troubledFiles:
		return
	print('Duplicated files (%d instances):' % (len(troubledFiles),))
	for files in troubledFiles:
		for f in files:
			print(f.formatInfo())
			for link in pointsto.get(f.name, []):
				print('is pointed to by ' + link)
			if os.path.islink(f.name):
				print(' points to %s ' % (os.readlink(f.name),))
		if not options.listonly:
			if options.force:
				_removeAllButNewest(files)
			else:
				for f in files:
					confirmDelete(f.name)
					for link in pointsto.get(f.name, []):
						confirmDelete(link)
		print('')


def _reportDuplicateDirs(troubledirs, dirpointsto):
	"""Print each group of duplicated directories; nothing is ever removed."""
	if not troubledirs:
		return
	print('Duplicated directories:')
	for group in troubledirs.values():
		for f in group:
			print(f.formatInfo())
			if f.name in dirpointsto:
				print("pointed to by %s" % ",".join(dirpointsto[f.name]))
		print('')


def main(args):
	"""
	Run rmdups with the given command line arguments (program name excluded).

	The directories named in args are searched; when none of them is usable
	the directories of the PATH environment variable are searched instead.
	Dangling links, files found under the same name in several directories
	and directories found under the same name are printed.  Unless -l is
	given, files are deleted after confirmation, or without asking (keeping
	the most recently modified copy) when -f is given.

	Returns None, which sys.exit() turns into a successful exit status.
	"""
	options, rest = _buildParser().parse_args(args)

	logging.basicConfig(level=logging.DEBUG if options.verbose else logging.WARNING)

	if options.force and options.listonly:
		options.force = False
		logging.warning('-f and -l should not be used together.  Getting rid of -f for safety reasons')

	path = constructPathList(rest)
	if not path:
		#if no directories were listed, add all the directories in the PATH environment
		#variable, making sure to add only unique directories.
		searchPath = os.environ.get('PATH', '')
		path = constructPathList([p for p in searchPath.split(os.pathsep) if p])

	#load up list of files and directories to ignore
	ignore = set()
	if options.ignoreFile:
		ignore.update(readFile(os.path.expanduser(options.ignoreFile)))

	path = [directory for directory in path if directory not in ignore]

	badlinks, trouble, troubledirs, pointsto, dirpointsto = _scanDirectories(path, ignore)

	_reportBadLinks(badlinks, options.listonly)
	_reportDuplicateFiles(trouble, pointsto, options)
	_reportDuplicateDirs(troubledirs, dirpointsto)
	
# Run as a script: hand the command line (minus the program name) to main()
# and use its return value as the process exit status.
if __name__ == '__main__':
	sys.exit(main(sys.argv[1:]))
	
	
