#!/bin/sh
# Find all files under the path, feed them into ls -l with xargs,
# and sort them on the size column.
# We can't depend on ls' own sort when using xargs, since with enough
# files xargs will split them across several ls invocations.
# Then we read the lines in order and check for duplicate sizes.
# find will spew errors for files it can't access, so send those to /dev/null.
find /path/to/dir -type f -print0 2> /dev/null | xargs --null ls -l | sort -k 5,6 |
while read PERMS LINKS USER GROUP SIZE M D Y FILE
do
        # Skip symbolic links
        [ -h "$FILE" ] && continue
        # Default LASTSIZE to -1 on the first pass so the -eq test doesn't complain.
        if [ "$SIZE" -eq "${LASTSIZE:--1}" ]
        then
                echo "$FILE same size as $LASTFILE"
        else
                LASTSIZE="$SIZE" ; LASTFILE="$FILE"
        fi
done
Here's an improved version that checks checksums. It can churn through about 4 gigs of random files in 7 seconds, uncached, on my not-so-great system.
The trick is that it only compares checksums between files of the same size, and first does a quick checksum of just their first 512 bytes to weed out files that are obviously different. Maybe the first 16K, or first 256K, would be better.
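Just to show the filter on its own, the quick prefix checksum is nothing fancier than hashing one 512-byte block read with dd. A minimal sketch, where somefile is just a stand-in name:
Code:
dd count=1 < somefile 2> /dev/null | md5sum
dd's default block size is 512 bytes, which is where that number comes from.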
Code:
#!/bin/bash

TMP=$(mktemp)

# Given a list of files of the same size in "$TMP",
# check which of them have the same checksums.
function checkgroup
{
        local FILE
        local SUM
        local LASTSUM
        local LASTFILE

        [ -s "$TMP" ] || return

        # Checksum just the first 512 bytes of each file.
        # If that differs, who cares about the rest?
        while read FILE
        do
                # dd's default block size is 512 bytes; silence its transfer stats.
                SUM=$(dd count=1 < "$FILE" 2> /dev/null | md5sum)
                # md5sum prints "checksum  -", so keep only the checksum field.
                read SUM G <<<"$SUM"
                echo "$SUM $FILE"
        done < "$TMP" | sort | while read SUM FILE
        do
                # Pass through only files that share a prefix checksum with another.
                if [ "$LASTSUM" != "$SUM" ]
                then
                        LASTSUM="$SUM"
                        LASTFILE="$FILE"
                        UNPRINTED=1
                        continue
                fi
                [ -z "$UNPRINTED" ] || echo "$LASTFILE"
                UNPRINTED=""
                echo "$FILE"
        done | xargs -rd '\n' md5sum | sort |
        while read SUM FILE
        do
                # Full-file checksums:  identical sums mean duplicate contents.
                if [ "$SUM" != "$LASTSUM" ]
                then
                        LASTSUM="$SUM"
                        LASTFILE="$FILE"
                else
                        echo "$FILE == $LASTFILE"
                fi
        done
}
# Find all files, feed them through ls -l, and sort them on size.
# We can't depend on ls' own sorting when there are too many files,
# since xargs may end up running ls more than once.
# Then loop through the sorted output looking for runs of files with
# the same size, and collect each run into a list for checkgroup.
find ~/public_html -type f -print0 | xargs --null ls -l | sort -k 5,6 |
while read PERMS LINKS USER GROUP SIZE M D Y FILE
do
        # Skip symbolic links
        [ -h "$FILE" ] && continue
        # Default LASTSIZE to -1 on the first pass so the -eq test doesn't complain.
        if [ "$SIZE" -eq "${LASTSIZE:--1}" ]
        then
                [ -s "$TMP" ] || echo "$LASTFILE" > "$TMP"
                echo "$FILE" >> "$TMP"
        else
                checkgroup
                LASTSIZE="$SIZE" ; LASTFILE="$FILE"
                :> "$TMP"
        fi
done
# Handle the last group, then clean up.
checkgroup
rm -f "$TMP"
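If you want to try a bigger prefix like the 16K or 256K mentioned above, only the dd line inside checkgroup should need to change. A sketch of the 16K version (the bs and count values here are just my guess at sensible numbers):
Code:
# hash the first 16K instead of the first 512 bytes
SUM=$(dd bs=16k count=1 < "$FILE" 2> /dev/null | md5sum)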