# identf.sh v.0.3. 揪出兩目錄下之相同檔案之 bash 腳本
## 此功能是做為另一支腳本 serf.sh 的搭配使用,也可單獨使用; 但都需使用到 sha1sum。或也可衍生成比對兩份檔案內之 unique ids。事實上它雖無法取代或對等那些目錄比對軟體,但此腳本仍有其存在及加成的價值。細節內文詳,並跑跑看便知所以。
## 0.3版
``` sh=
#!/bin/bash
# v.0.3. 20230416. ken woo. copyleft.
# dig out identical files from Left- and Right- sides via sha1sum(or say modified into via any form of unique id; e.g., add size, time).
# ./this_script [options] Left-src-file Right-src-file assigned-left-output assigned-right-output. note the checksums must be sorted.
# items format in the source files: "full-path-filename checksum". e.g., /home/ken/tmp/a.txt c48022dfb82dd8edddd664bd684962d1bfc90db4
# to sort checksums: sed 's/\(.*\) \([a-f0-9A-F]\+$\)/\2 \1/' in.txt | sort -k1,1 | sed 's/\(^[a-f0-9A-F]\+\) \(.*\)/\2 \1/' > out.txt
# or via sha1sum-generated file: sort -k1,1 in.txt | sed -n 's/\(^[a-f0-9A-F]\+\) \(.*\)/\2 \1/p' > out.txt which becomes the source file.
# options: -a, -b, -c, -d.
# -a: precedes each identical group with a line of sequential number; [[i]].
# -b: not only -a but also ahead each item the next sequential number.
# -c: suppose for example, L has 1 line identical to 3 lines in R, then stuffing with additional 2 lines "this-checksum" in L; where
# is better for 2-way comparison applications/e.g., "meld".
# -d: enable the output of prefixing "-" of all that are not matched. and it is easy to manually separate later.
# note: 1) if the item format or pattern needs to change, just treat these left-assignment of $left and $right lines.
# 2) be aware that identical checksums do not strictly guarantee identical file contents; conversely, different checksums do guarantee different contents.
# final release the v.0.2 if luckily no bugs and forbidden myself any new revised ideas.
# v.0.2 revised: minor usage addendum and added extra info for output and added an option -d since it seems essential.
# example: Left-src & Right-src and Left-output & Right-output are as below by options -abc,
# Left-src Right-src Left-output Right-output
#
# /a/b 123 /i/j 456 [[1]] [[1]]
# /c/d 456 /k/l 789 [[1]] /c/d 456 [[1]] /i/j 456
# /e/f 456 /m/n 789 [[2]] /e/f 456 [[2]] 456
# /g/h 789 /o/p 789
# [[3]] [[3]]
# [[3]] /g/h 789 [[3]] /k/l 789
# [[4]] 789 [[4]] /m/n 789
# [[5]] 789 [[5]] /o/p 789
IDENTF_VER="Bash 4.3+ script. IDENTF version 0.3"
opt_a="0"
opt_b="0"
opt_c="0"
opt_d="0"
myopts=""
seql=1
seqr=1
cnt=0
isdone="1"
identical_lr=""
while getopts abcd opt; do
case "$opt" in
a )
myopts="${myopts}a"
opt_a="1";;
b )
myopts="${myopts}b"
opt_a="1"
opt_b="1";;
c )
myopts="${myopts}c"
opt_c="1";;
d )
myopts="${myopts}d"
opt_d="1";;
* )
echo "wrong options"
exit 1;;
esac
done
shift $[ $OPTIND - 1 ]
[[ $# -ne 4 ]] && echo "wrong parameters" && exit 1;
exec 3< $1
exec 4< $2
exec 5> $3
exec 6> $4
# auxout NAME
# Writes the current value of the counter variable called NAME to stdout as
# "[[value]] " (no trailing newline), then increments that variable in the
# caller's scope through a nameref. Always returns 0.
auxout() {
  local -n counter_ref="$1"
  printf '[[%s]] ' "$counter_ref"
  counter_ref=$(( counter_ref + 1 ))
  return 0
}
# auxNavL LINE
# When option -d is active ($opt_d is "1"), writes LINE prefixed with "-" to
# fd 5 (the Left output) to mark it as unmatched; otherwise writes nothing.
# Always returns 0 so it can be chained with && by callers.
function auxNavL() {
  # printf instead of echo: a LINE such as "n" or "e" would otherwise become
  # "-n"/"-e", which echo consumes as an option and prints nothing.
  [[ $opt_d = "1" ]] && printf '%s\n' "-$1" >&5
  return 0
}
# auxNavR LINE
# When option -d is active ($opt_d is "1"), writes LINE prefixed with "-" to
# fd 6 (the Right output) to mark it as unmatched; otherwise writes nothing.
# Always returns 0 so it can be chained with && by callers.
function auxNavR() {
  # printf instead of echo: a LINE such as "n" or "e" would otherwise become
  # "-n"/"-e", which echo consumes as an option and prints nothing.
  [[ $opt_d = "1" ]] && printf '%s\n' "-$1" >&6
  return 0
}
### ----------
# Start-up banner: version, the command as invoked, working directory and
# start date, then remember the start time for the elapsed-time report.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SCRIPT_NAME=$( basename -- "${BASH_SOURCE[0]}" )
inipath=$( pwd )
printf '\nscript version: %s\n\n' "$IDENTF_VER"
printf "cmd: %s/%s '-%s' '%s' '%s' '%s' '%s'\n" \
  "$SCRIPT_DIR" "$SCRIPT_NAME" "$myopts" "$1" "$2" "$3" "$4"
printf 'pwd: %s\n' "$inipath"
printf '\n'
echo $( date )
printf '\n'
elapse_time_b=$SECONDS
### ----------
# Main merge pass over the two checksum-sorted sources.
#   fd 3 / fd 4 : Left / Right source ("path checksum" per line)
#   fd 5 / fd 6 : Left / Right output
# For every Left item the Right side is advanced until its checksum is not
# smaller; equal checksums open an "identical group" (remembered in
# $identical_lr) whose further members are emitted on both sides.
# Fixes over the previous revision:
#   * if Left runs out while skipping smaller items, the last Left item is no
#     longer reported as a (false) match of the current Right item;
#   * with -d, the Left item under comparison when Right runs out is now
#     emitted as unmatched instead of being dropped;
#   * data lines are written with printf, so backslashes or a leading "-" in
#     a line are output verbatim.
while read -r ctx1 <&3; do
  # To change the item format, adapt the $left / $right extraction lines.
  left=$( sed 's/.* \([0-9a-fA-F]\+$\)/\1/' <<< "$ctx1" )
  [[ ${#left} -eq 0 ]] && continue

  if [[ "$identical_lr" = "$left" ]]; then
    # One more Left member of the group currently being emitted.
    [[ $opt_b = "1" ]] && auxout seql >&5
    printf '%s\n' "$ctx1" >&5
    # -c: remember that the Right side owes one line for this item.
    [[ $opt_c = "1" ]] && cnt=$(( cnt + 1 ))
    continue
  fi

  while read -r ctx2 <&4; do
    right=$( sed 's/.* \([0-9a-fA-F]\+$\)/\1/' <<< "$ctx2" )
    [[ ${#right} -eq 0 ]] && continue

    if [[ "$identical_lr" = "$right" ]]; then
      # One more Right member of the current group.
      [[ $opt_b = "1" ]] && auxout seqr >&6
      printf '%s\n' "$ctx2" >&6
      if [[ $opt_c = "1" ]]; then
        if [[ $cnt -gt 0 ]]; then
          cnt=$(( cnt - 1 ))
        else
          # Right has more members than Left: pad Left with the checksum.
          [[ $opt_b = "1" ]] && auxout seql >&5
          printf '%s\n' "$right" >&5
        fi
      fi
      continue
    fi

    # The previous group (if any) has ended on the Right side.
    if [[ $opt_c = "1" ]]; then
      # Left had more members than Right: pad Right with the checksum.
      while [[ $cnt -gt 0 ]]; do
        [[ $opt_b = "1" ]] && auxout seqr >&6
        printf '%s\n' "$identical_lr" >&6
        cnt=$(( cnt - 1 ))
      done
    fi
    identical_lr=""

    if [[ "$left" < "$right" ]]; then
      # Left is behind: skip (and with -d report) Left items until one is
      # not smaller than the current Right checksum.
      auxNavL "$ctx1"
      while read -r ctx1_1 <&3; do
        left=$( sed 's/.* \([0-9a-fA-F]\+$\)/\1/' <<< "$ctx1_1" )
        [[ ${#left} -eq 0 ]] && continue
        if [[ "$left" < "$right" ]]; then
          auxNavL "$ctx1_1"
          continue
        fi
        ctx1="$ctx1_1"
        break
      done
      if [[ "$left" < "$right" ]]; then
        # Left ran out while skipping: nothing can match any more. The Right
        # line already consumed is unmatched; the rest of Right is handled
        # after the loops.
        auxNavR "$ctx2"
        break 2
      fi
    fi

    if [[ "$left" > "$right" ]]; then
      # Right is behind: this Right item has no partner.
      auxNavR "$ctx2"
      continue
    fi

    # Checksums are equal: open a new identical group.
    identical_lr="$left"
    if [[ $opt_a = "1" ]]; then
      if [[ $opt_b = "1" ]]; then
        printf '\n[[%s]]\n' "$seql" >&5
        auxout seql >&5
        printf '%s\n' "$ctx1" >&5
        printf '\n[[%s]]\n' "$seqr" >&6
        auxout seqr >&6
        printf '%s\n' "$ctx2" >&6
      else
        # -a without -b: both sides share the Left counter as group number.
        printf '\n[[%s]]\n%s\n' "$seql" "$ctx1" >&5
        printf '\n[[%s]]\n%s\n' "$seql" "$ctx2" >&6
        seql=$(( seql + 1 ))
      fi
    else
      printf '%s\n' "$ctx1" >&5
      printf '%s\n' "$ctx2" >&6
    fi
    # Cleared only on a real match, so that reaching the end of Right in the
    # middle of the scan still counts as "no partner for this Left item".
    isdone="0"
    break
  done

  if [[ $isdone = "1" ]]; then
    # Right is exhausted and the current Left item found no partner.
    if [[ $opt_d = "1" ]]; then
      printf '%s\n' "-$ctx1" >&5
    else
      break
    fi
  else
    isdone="1"
  fi
done
# Handles the case where Left ran out first, including Left ending inside an
# identical group: the remaining Right members of that last group (checksum
# in $identical_lr) are still emitted.
# Fixes over the previous revision:
#   * with -c, padding owed to the Right side is flushed even when Right
#     reaches its end before a non-matching line is seen;
#   * with -d, the first non-matching Right line read here is reported as
#     unmatched instead of being silently consumed.
if [[ ${#identical_lr} -ne 0 ]]; then
  unmatched_r=""
  while read -r ctx2 <&4; do
    right=$( sed 's/.* \([0-9a-fA-F]\+$\)/\1/' <<< "$ctx2" )
    [[ ${#right} -eq 0 ]] && continue
    if [[ "$identical_lr" != "$right" ]]; then
      # End of the group; keep the line so it can be reported after padding.
      unmatched_r="$ctx2"
      break
    fi
    [[ $opt_b = "1" ]] && auxout seqr >&6
    printf '%s\n' "$ctx2" >&6
    if [[ $opt_c = "1" ]]; then
      if [[ $cnt -gt 0 ]]; then
        cnt=$(( cnt - 1 ))
      else
        # Right has more members than Left: pad Left with the checksum.
        [[ $opt_b = "1" ]] && auxout seql >&5
        printf '%s\n' "$right" >&5
      fi
    fi
  done
  if [[ $opt_c = "1" ]]; then
    # Left had more members than Right: pad Right with the checksum.
    while [[ $cnt -gt 0 ]]; do
      [[ $opt_b = "1" ]] && auxout seqr >&6
      printf '%s\n' "$identical_lr" >&6
      cnt=$(( cnt - 1 ))
    done
  fi
  # auxNavR only writes when -d is active.
  [[ -n "$unmatched_r" ]] && auxNavR "$unmatched_r"
fi
# Option -d: every line still unread on the Right source (fd 4) has no
# partner, so copy it to the Right output (fd 6) prefixed with "-".
if [[ $opt_d = "1" ]]; then
  while read -r ctx2 <&4; do
    # printf instead of echo: a line such as "n" would otherwise turn into
    # the echo option "-n" and produce no output.
    printf '%s\n' "-$ctx2" >&6
  done
fi
# Close both sources (fd 3, 4) and both outputs (fd 5, 6).
exec 3<&- 4<&- 5>&- 6>&-
### ----------
# Print the end date and the run time measured from $elapse_time_b.
printf '\n'
echo $( date )
printf '\n'
time_elapsed=$(( SECONDS - elapse_time_b ))
printf '\nit took %d minute(s) %d seconds\n\n' \
  "$(( time_elapsed / 60 ))" "$(( time_elapsed % 60 ))"
### ----------
# end of sh
```
### 0.2版
``` sh=
#!/bin/bash
# v.0.2. 20230402. ken woo. copyleft.
# dig out identical files from Left- and Right- sides via sha1sum(or say modified into via any form of unique id; e.g., add size, time).
# ./this_script [options] Left-src-file Right-src-file assigned-left-output assigned-right-output. note the checksums must be sorted.
# items format in the source files: "full-path-filename checksum". e.g., /home/ken/tmp/a.txt c48022dfb82dd8edddd664bd684962d1bfc90db4
# to sort checksums: sed 's/\(.*\) \([a-f0-9A-F]\+$\)/\2 \1/' in.txt | sort -k1 | sed 's/\(^[a-f0-9A-F]\+\) \(.*\)/\2 \1/' > out.txt
# options: -a, -b, -c.
# -a: precedes each identical group with a line of sequential number; [[i]].
# -b: not only -a but also ahead each item the next sequential number.
# -c: suppose for example, L has 1 line identical to 3 lines in R, then stuffing with additional 2 lines "this-checksum" in L; where
# is better for 2-way comparison applications/e.g., "meld".
# note: 1) if the item format or pattern needs to change, just treat the left-assignment of $left and $right lines.
# 2) checksum identical does not exactly mean file contents identical which must be aware of.
# final release the v.0.2 if luckily no bugs and forbidden myself any new revised ideas.
# ---- defaults, option parsing and file descriptors -------------------
opt_a="0"        # -a: print a "[[n]]" header line before every identical group
opt_b="0"        # -b: implies -a and also prefixes every item with "[[n]] "
opt_c="0"        # -c: pad the shorter side of a group with bare checksum lines
seql=1           # next sequence number for the Left output
seqr=1           # next sequence number for the Right output
cnt=0            # -c only: Left items of the current group not yet paired on the Right
identical_lr=""  # checksum of the identical group currently being emitted

while getopts abc opt; do
  case "$opt" in
    a)
      opt_a="1"
      ;;
    b)
      opt_a="1"
      opt_b="1"
      ;;
    c)
      opt_c="1"
      ;;
    *)
      echo "wrong options" >&2
      exit 1
      ;;
  esac
done
# $(( )) replaces the deprecated $[ ] arithmetic form.
shift $(( OPTIND - 1 ))

if [[ $# -ne 4 ]]; then
  echo "wrong parameters" >&2
  exit 1
fi

# fd 3/4: Left/Right source lists, fd 5/6: Left/Right outputs.
# Quoted so that paths containing blanks work; stop if any file cannot be
# opened (bash already prints the reason) instead of running on closed fds.
exec 3< "$1" || exit 1
exec 4< "$2" || exit 1
exec 5> "$3" || exit 1
exec 6> "$4" || exit 1
# auxout NAME
# Writes the current value of the counter variable called NAME to stdout as
# "[[value]] " (no trailing newline), then increments that variable in the
# caller's scope through a nameref. Always returns 0.
auxout() {
  local -n counter_ref="$1"
  printf '[[%s]] ' "$counter_ref"
  counter_ref=$(( counter_ref + 1 ))
  return 0
}
# Main merge pass over the two checksum-sorted sources.
#   fd 3 / fd 4 : Left / Right source ("path checksum" per line)
#   fd 5 / fd 6 : Left / Right output
# For every Left item the Right side is advanced until its checksum is not
# smaller; equal checksums open an "identical group" (remembered in
# $identical_lr) whose further members are emitted on both sides.
# Fixes over the previous revision:
#   * if Left runs out while skipping smaller items, the last Left item is no
#     longer reported as a (false) match of the current Right item;
#   * data lines are written with printf, so backslashes or a leading "-" in
#     a line are output verbatim.
while read -r ctx1 <&3; do
  # To change the item format, adapt the $left / $right extraction lines.
  left=$( sed 's/.* \([0-9a-fA-F]\+$\)/\1/' <<< "$ctx1" )
  [[ ${#left} -eq 0 ]] && continue

  if [[ "$identical_lr" = "$left" ]]; then
    # One more Left member of the group currently being emitted.
    [[ $opt_b = "1" ]] && auxout seql >&5
    printf '%s\n' "$ctx1" >&5
    # -c: remember that the Right side owes one line for this item.
    [[ $opt_c = "1" ]] && cnt=$(( cnt + 1 ))
    continue
  fi

  while read -r ctx2 <&4; do
    right=$( sed 's/.* \([0-9a-fA-F]\+$\)/\1/' <<< "$ctx2" )
    [[ ${#right} -eq 0 ]] && continue

    if [[ "$identical_lr" = "$right" ]]; then
      # One more Right member of the current group.
      [[ $opt_b = "1" ]] && auxout seqr >&6
      printf '%s\n' "$ctx2" >&6
      if [[ $opt_c = "1" ]]; then
        if [[ $cnt -gt 0 ]]; then
          cnt=$(( cnt - 1 ))
        else
          # Right has more members than Left: pad Left with the checksum.
          [[ $opt_b = "1" ]] && auxout seql >&5
          printf '%s\n' "$right" >&5
        fi
      fi
      continue
    fi

    # The previous group (if any) has ended on the Right side.
    if [[ $opt_c = "1" ]]; then
      # Left had more members than Right: pad Right with the checksum.
      while [[ $cnt -gt 0 ]]; do
        [[ $opt_b = "1" ]] && auxout seqr >&6
        printf '%s\n' "$identical_lr" >&6
        cnt=$(( cnt - 1 ))
      done
    fi
    identical_lr=""

    if [[ "$left" < "$right" ]]; then
      # Left is behind: skip Left items until one is not smaller than the
      # current Right checksum.
      while read -r ctx1_1 <&3; do
        left=$( sed 's/.* \([0-9a-fA-F]\+$\)/\1/' <<< "$ctx1_1" )
        [[ ${#left} -eq 0 ]] && continue
        [[ "$left" < "$right" ]] && continue
        ctx1="$ctx1_1"
        break
      done
      # Still smaller means Left ran out while skipping: nothing can match
      # any more, so leave both loops instead of reporting a false match.
      [[ "$left" < "$right" ]] && break 2
    fi

    # Right is behind: read the next Right item.
    [[ "$left" > "$right" ]] && continue

    # Checksums are equal: open a new identical group.
    if [[ $opt_a = "1" ]]; then
      if [[ $opt_b = "1" ]]; then
        printf '\n[[%s]]\n' "$seql" >&5
        auxout seql >&5
        printf '%s\n' "$ctx1" >&5
        printf '\n[[%s]]\n' "$seqr" >&6
        auxout seqr >&6
        printf '%s\n' "$ctx2" >&6
      else
        # -a without -b: both sides share the Left counter as group number.
        printf '\n[[%s]]\n%s\n' "$seql" "$ctx1" >&5
        printf '\n[[%s]]\n%s\n' "$seql" "$ctx2" >&6
        seql=$(( seql + 1 ))
      fi
    else
      printf '%s\n' "$ctx1" >&5
      printf '%s\n' "$ctx2" >&6
    fi
    identical_lr="$left"
    break
  done
done
# Handles the case where Left ran out first, including Left ending inside an
# identical group: the remaining Right members of that last group (checksum
# in $identical_lr) are still emitted.
# Fix over the previous revision: with -c, padding owed to the Right side is
# flushed even when Right reaches its end before a non-matching line is seen.
if [[ ${#identical_lr} -ne 0 ]]; then
  while read -r ctx2 <&4; do
    right=$( sed 's/.* \([0-9a-fA-F]\+$\)/\1/' <<< "$ctx2" )
    [[ ${#right} -eq 0 ]] && continue
    # First differing checksum ends the group.
    [[ "$identical_lr" = "$right" ]] || break
    [[ $opt_b = "1" ]] && auxout seqr >&6
    printf '%s\n' "$ctx2" >&6
    if [[ $opt_c = "1" ]]; then
      if [[ $cnt -gt 0 ]]; then
        cnt=$(( cnt - 1 ))
      else
        # Right has more members than Left: pad Left with the checksum.
        [[ $opt_b = "1" ]] && auxout seql >&5
        printf '%s\n' "$right" >&5
      fi
    fi
  done
  if [[ $opt_c = "1" ]]; then
    # Left had more members than Right: pad Right with the checksum.
    while [[ $cnt -gt 0 ]]; do
      [[ $opt_b = "1" ]] && auxout seqr >&6
      printf '%s\n' "$identical_lr" >&6
      cnt=$(( cnt - 1 ))
    done
  fi
fi
# Close both sources (fd 3, 4) and both outputs (fd 5, 6).
exec 3<&- 4<&- 5>&- 6>&-
# end of sh
```